Build branch fix-integration-tests with version dev (2dbe3b72)

Build pipeline: vsh-ci-dev-k8tz4 Source commit: 2dbe3b7231 Source message: Fix pointers to test resources
2024-10-17 17:56:12 +00:00
commit cd0af18851
2125 changed files with 1018836 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+src/mapping/bd_rhapsody*/*.cwl linguist-generated
+src/query/cellxgene_census linguist-generated
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,40 @@
+# Jupyter notebooks
+.ipynb_checkpoints
+
+# pycache
+*__pycache__*
+.nfs*
+
+# R related
+.Rhistory
+*.Rproj
+.Rproj.user
+
+# Python virtual environments
+.venv
+
+# temporary files related
+temp
+
+# NextFlow
+work/
+.nextflow.log
+flowchart.*
+.nextflow*
+out/ 
+
+# macOS
+.DS_Store
+
+# viash related
+.viash_log*
+log.txt
+check_results/
+out/
+output*
+output_log/
+resources_test
+/viash_tools/
+
+# vscode
+.vscode/launch.json
--- a/.pylintrc
+++ b/.pylintrc
@@ -0,0 +1,626 @@
+[MAIN]
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Load and enable all available extensions. Use --list-extensions to see a list
+# of all available extensions.
+#enable-all-extensions=
+
+# In error mode, messages with a category besides ERROR or FATAL are
+# suppressed, and no reports are done by default. Error mode is compatible with
+# disabling specific errors.
+#errors-only=
+
+# Always return a 0 (non-error) status code, even if lint errors are found.
+# This is primarily useful in continuous integration scripts.
+#exit-zero=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-allow-list=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+extension-pkg-whitelist=
+
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+fail-on=
+
+# Specify a score threshold under which the program will exit with error.
+fail-under=10
+
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+#from-stdin=
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=CVS
+
+# Add files or directories matching the regular expressions patterns to the
+# ignore-list. The regex matches against paths and can be in Posix or Windows
+# format. Because '\' represents the directory delimiter on Windows systems, it
+# can't be used as an escape character.
+ignore-paths=
+
+# Files or directories matching the regular expression patterns are skipped.
+# The regex matches against base names, not paths. The default value ignores
+# Emacs file locks
+ignore-patterns=^\.#
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.10
+
+# Discover python modules and packages in the file system subtree.
+recursive=no
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# In verbose mode, extra non-checker-related info will be displayed.
+#verbose=
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+          bar,
+          baz,
+          toto,
+          tutu,
+          tata
+
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+bad-names-rgxs=
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+#class-attribute-rgx=
+
+# Naming style matching correct class constant names.
+class-const-naming-style=UPPER_CASE
+
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+#class-const-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style. If left empty, class names will be checked with the set naming style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,
+           j,
+           k,
+           ex,
+           Run,
+           _
+
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+good-names-rgxs=
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+#typevar-rgx=
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
+#variable-rgx=
+
+
+[CLASSES]
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp,
+                      __post_init__
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[DESIGN]
+
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+exclude-too-few-public-methods=
+
+# List of qualified class names to ignore when counting class parents (see
+# R0901)
+ignored-parents=
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branches for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=BaseException,
+                       Exception
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=
+
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+ext-import-graph=
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+import-graph=
+
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=HIGH,
+           CONTROL_FLOW,
+           INFERENCE,
+           INFERENCE_FAILURE,
+           UNDEFINED
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        line-too-long,
+        missing-module-docstring,
+        redefined-outer-name
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by comma (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[METHOD_ARGS]
+
+# List of qualified names (i.e., library.method) which require a timeout
+# parameter e.g. 'requests.api.get,requests.api.post'
+timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+      XXX,
+      TODO
+
+# Regular expression of note tags to take in consideration.
+notes-rgx=
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete names of functions that never return. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+msg-template=
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+#output-format=
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[SIMILARITIES]
+
+# Comments are removed from the similarity computation
+ignore-comments=yes
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings=yes
+
+# Imports are removed from the similarity computation
+ignore-imports=yes
+
+# Signatures are removed from the similarity computation
+ignore-signatures=yes
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: en_AG (hunspell), en_AU
+# (hunspell), en_BS (hunspell), en_BW (hunspell), en_BZ (hunspell), en_CA
+# (hunspell), en_DK (hunspell), en_GB (hunspell), en_GH (hunspell), en_HK
+# (hunspell), en_IE (hunspell), en_IN (hunspell), en_JM (hunspell), en_MW
+# (hunspell), en_NA (hunspell), en_NG (hunspell), en_NZ (hunspell), en_PH
+# (hunspell), en_SG (hunspell), en_TT (hunspell), en_US (hunspell), en_ZA
+# (hunspell), en_ZM (hunspell), en_ZW (hunspell).
+spelling-dict=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of symbolic message names to ignore for Mixin members.
+ignored-checks-for-mixins=no-member,
+                          not-async-context-manager,
+                          not-context-manager,
+                          attribute-defined-outside-init
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+# Regex pattern to define which classes are considered mixins.
+mixin-class-rgx=.*[Mm]ixin
+
+# List of decorators that change the signature of a decorated function.
+signature-mutators=
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of names allowed to shadow builtins
+allowed-redefined-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,132 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+[INSERT CONTACT METHOD].
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 OpenPipelines
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,13 @@
+OpenPipeline
+================
+
+<!-- README.md is generated by running 'quarto render README.qmd' -->
+
+Extensible single cell analysis pipelines for reproducible and
+large-scale single cell processing using Viash and Nextflow.
+
+The provided pipelines are built using the [Viash
+framework](http://www.viash.io) on top of the Nextflow workflow system.
+For more information on Nextflow, please visit the [Nextflow GitHub
+page](https://github.com/nextflow-io/nextflow) and the [Nextflow
+documentation page](https://www.nextflow.io/docs/latest/index.html).
--- a/README.qmd
+++ b/README.qmd
@@ -0,0 +1,18 @@
+---
+title: OpenPipeline
+format: gfm
+---
+
+<!-- README.md is generated by running 'quarto render README.qmd' -->
+
+```{r, echo = FALSE, message = FALSE, error = FALSE, warning = FALSE}
+library(tidyverse)
+```
+
+Extensible single cell analysis pipelines for reproducible and large-scale single cell processing using Viash and Nextflow. 
+
+The provided pipelines are built using the [Viash framework](http://www.viash.io) on top of the
+Nextflow workflow system. For more information on Nextflow, please visit the [Nextflow GitHub page](https://github.com/nextflow-io/nextflow)
+and the [Nextflow documentation page](https://www.nextflow.io/docs/latest/index.html).
+
+
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -0,0 +1,29 @@
+viash_version: 0.9.0
+
+version: dev
+
+source: src
+target: target
+
+# Note: this causes the docker images to be renamed
+name: openpipeline
+organization: openpipelines-bio
+
+links:
+  repository: https://github.com/openpipelines-bio/openpipeline
+  docker_registry: ghcr.io
+  homepage: https://openpipelines.bio
+  documentation: https://openpipelines.bio/fundamentals
+  issue_tracker: https://github.com/openpipelines-bio/openpipeline/issues
+
+info:
+  test_resources:
+    - type: s3
+      path: s3://openpipelines-data
+      dest: resources_test
+
+config_mods: |
+  .test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}
+  .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
+  .runners[.type == 'nextflow'].directives.tag := '$id'
+  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
--- a/images/concepts/fig.svg
+++ b/images/concepts/fig.svg
--- a/images/concepts/fig_cell.svg
+++ b/images/concepts/fig_cell.svg
--- a/images/concepts/fig_modality_adt.svg
+++ b/images/concepts/fig_modality_adt.svg
--- a/images/concepts/fig_modality_atac.svg
+++ b/images/concepts/fig_modality_atac.svg
--- a/images/concepts/fig_modality_rna.svg
+++ b/images/concepts/fig_modality_rna.svg
--- a/images/concepts/fig_modality_vdj.svg
+++ b/images/concepts/fig_modality_vdj.svg
--- a/images/concepts/fig_workflow_multiomics_adt_multisample.svg
+++ b/images/concepts/fig_workflow_multiomics_adt_multisample.svg
--- a/images/concepts/fig_workflow_multiomics_adt_singlesample.svg
+++ b/images/concepts/fig_workflow_multiomics_adt_singlesample.svg
--- a/images/concepts/fig_workflow_multiomics_rna_multisample.svg
+++ b/images/concepts/fig_workflow_multiomics_rna_multisample.svg
--- a/images/concepts/fig_workflow_multiomics_rna_singlesample.svg
+++ b/images/concepts/fig_workflow_multiomics_rna_singlesample.svg
--- a/images/concepts/generate_subimages.sh
+++ b/images/concepts/generate_subimages.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# so let's do it separately
+rm images/concepts/fig_*.svg
+
+for id in cell modality_rna modality_adt modality_vdj modality_atac workflow_multiomics_rna_singlesample workflow_multiomics_rna_multisample workflow_multiomics_adt_singlesample workflow_multiomics_adt_multisample; do
+  inkscape --export-type="svg" --export-id="$id" --export-id-only images/concepts/fig.svg
+  svgo images/concepts/fig_${id}.svg
+done
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,5 @@
+nextflow.enable.dsl=2
+
+workflow {
+    print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
+}
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,22 @@
+// template nextflow.config for nested workflows
+
+manifest {
+  nextflowVersion = '!>=20.12.1-edge'
+}
+
+// TODO 1: uncomment and adapt `rootDir` according to its relative path within the project
+// params {
+//   rootDir = "$projectDir/../.."
+// }
+// 
+// workflowDir = "${params.rootDir}/workflows"
+// targetDir = "${params.rootDir}/target/nextflow"
+
+// TODO 2: insert custom imports here
+
+// TODO 3: uncomment
+// docker {
+//   runOptions = "-v \$(realpath ${params.rootDir}):\$(realpath ${params.rootDir})"
+// }
+
+
--- a/resources_test_scripts/10x_20k_fixed.sh
+++ b/resources_test_scripts/10x_20k_fixed.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_fixed
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+
+# dataset page:
+# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# Write the first 200000 records of the fastq file $1 to the gzip-compressed
+# file $2. Nothing is done when $2 already exists, so reruns skip finished
+# subsets.
+function seqkit_head {
+  local input="$1"
+  local output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename "$input")"
+    # Write to a temporary name and rename afterwards: if seqkit or gzip
+    # fails, no truncated "$output" is left behind to be mistaken for a
+    # finished subset by the existence check on the next run.
+    seqkit head -n 200000 "$input" | gzip > "$output.tmp"
+    mv "$output.tmp" "$output"
+  fi
+}
+
+orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
+
+seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz"
+
+# download feature reference
+feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
+if [[ ! -f "$feature_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
+fi
+
+# download probe set
+probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
+if [[ ! -f "$probe_set" ]]; then
+  wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set"
+fi
+
+sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
+
+# Build a corrected copy of the probe set in which the gene id (1st
+# comma-separated column) is replaced by the id found in the test reference
+# GTF. Rows whose id has no hit in the GTF are left out. The whole step is
+# skipped when the corrected file already exists.
+probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
+if [[ ! -f "$probe_set_corrected" ]]; then
+  reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
+  # Decompress once so the per-row lookups below can use plain grep.
+  gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf" 
+  # `|| [[ -n $line ]]` also processes a final line without a trailing newline.
+  cat "$probe_set" | while read line || [[ -n $line ]];
+  do
+    echo "Line: $line"
+    old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
+    echo "Old ID: $old_id"
+    # Rows starting with '#' and the row whose first column is the literal
+    # "gene_id" are copied unchanged.
+    if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
+      echo "Just writing line"
+      printf "%s\n" "$line" >> "$probe_set_corrected"
+    else
+      # grep exits with 1 when nothing matches; `|| test $? = 1` accepts that
+      # status under `set -e` while any other non-zero status still aborts.
+      gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
+      if [ ! -z "$gtf_lookup" ]; then
+        echo "Found hit"
+        # Take field 10 of the matching records whose 3rd field is "gene" and
+        # strip the leading quote and the trailing `";`.
+        new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
+        echo "New ID: $new_id"
+        # Replace the first occurrence of the old id in the row.
+        new_line=${line/"$old_id"/"$new_id"}
+        printf "%s\n" "$new_line" >> "$probe_set_corrected"
+      else
+        echo "Did not find hit"
+      fi
+    fi
+  done
+fi
+
+# # Input FASTA:
+# #   >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
+# # Output FASTA:
+# #   >chr1 1
+# input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
+# fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
+# if [[ ! -f "$input_fastq" ]]; then
+#   wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
+# fi
+# zcat "$input_fastq" \
+#     | sed -E 's/^>(\S+).*/>\1 \1/' \
+#     | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
+#     | sed -E 's/^>MT />chrM /' \
+#     > "$fasta_modified"
+
+# pigz --fast "$fasta_modified"
+# fasta_modified="$fasta_modified.gz"
+# # Input GTF:
+# #     ... gene_id "ENSG00000223972.5"; ...
+# # Output GTF:
+# #     ... gene_id "ENSG00000223972"; gene_version "5"; ...
+# input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
+# gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
+# if [[ ! -f "$input_gtf" ]]; then
+#   wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
+# fi
+
+# REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
+# zcat "$input_gtf" \
+#     | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
+#     | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
+#     | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
+#     > "$gtf_modified"
+# pigz --fast "$gtf_modified"
+# gtf_modified="$gtf_modified.gz"
+
+# Build a cellranger reference with the make_reference workflow when the
+# cached archive is not present yet.
+# NOTE(review): "$fasta_modified" and "$gtf_modified" are only assigned in the
+# commented-out section above, so they expand to empty strings here whenever
+# "$final_genome" is missing. "$final_genome" is also not referenced again in
+# the rest of this script (the mapping run below uses "$genome_tar").
+# TODO confirm whether this block is still needed.
+final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
+if [ ! -f "$final_genome" ]; then
+  NXF_VER=21.10.6 nextflow \
+    run . \
+    -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
+    -profile docker \
+    -resume \
+    --id "GRCh38" \
+    --genome_fasta "$fasta_modified" \
+    --transcriptome_gtf "$gtf_modified" \
+    --target "cellranger" \
+    --output_fasta "reference.fa.gz" \
+    --output_gtf "reference.gtf.gz" \
+    --output_cellranger "GRCh38.cellranger.genome.fa.gz" \
+    --publish_dir "$HOME/.cache/openpipeline/"
+fi
+
+
+# Run mapping pipeline
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - ${orig_sample_id}_subset
+  library_type:
+    - "Gene Expression"
+  library_lanes:
+    - "any"
+
+probe_set: "$probe_set_corrected"
+gex_reference: "$genome_tar"
+feature_reference: "$feature_ref"
+publish_dir: "$OUT/processed"
+probe_barcode_ids:
+  - BC001
+  - BC002
+  - BC003
+  - BC004
+sample_ids:
+  - Liver_BC1
+  - Ovarian_BC2
+  - Colorectal_BC3
+  - Pancreas_BC4
+gex_generate_bam: false
+sample_force_cells:
+  - 5000
+  - -1
+  - -1
+  - -1
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config
--- a/resources_test_scripts/10x_5k_anticmv.sh
+++ b/resources_test_scripts/10x_5k_anticmv.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+
+set -eo pipefail
+
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_anticmv
+OUT=resources_test/$ID
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/integrated-gex-totalseqc-and-tcr-analysis-of-connect-generated-library-from-5k-cmv-t-cells-2-standard
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/5k_human_antiCMV_T_TBNK_connect_Multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# Write the first 200000 records of the fastq file $1 to the gzip-compressed
+# file $2. Nothing is done when $2 already exists, so reruns skip finished
+# subsets.
+function seqkit_head {
+  local input="$1"
+  local output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename "$input")"
+    # Write to a temporary name and rename afterwards: if seqkit or gzip
+    # fails, no truncated "$output" is left behind to be mistaken for a
+    # finished subset by the existence check on the next run.
+    seqkit head -n 200000 "$input" | gzip > "$output.tmp"
+    mv "$output.tmp" "$output"
+  fi
+}
+
+orig_sample_id="5k_human_antiCMV_T_TBNK_connect"
+
+seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R1_001.fastq.gz"
+seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R2_001.fastq.gz"
+
+# download immune panel fasta if needed
+feature_reference="$raw_dir/feature_reference.csv"
+if [[ ! -f "$feature_reference" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_count_feature_reference.csv" -O "$feature_reference"
+fi
+
+# download vdj reference if needed
+vdj_ref="$raw_dir/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
+if [[ ! -f "$vdj_ref" ]]; then
+  wget "https://cf.10xgenomics.com/supp/cell-vdj/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" -O "$vdj_ref"
+fi
+
+
+# Run mapping pipeline
+# TODO: Also include conversion to h5mu
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_GEX_1_subset"
+    - "${orig_sample_id}_AB_subset"
+    - "${orig_sample_id}_VDJ_subset"
+  library_type:
+    - "Gene Expression"
+    - "Antibody Capture"
+    - "VDJ"
+
+gex_reference: "$genome_tar"
+vdj_reference: "$vdj_ref"
+feature_reference: "$feature_reference"
+publish_dir: "$OUT/processed"
+HERE
+
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
+
+# Create h5mu
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output.output"
+publish_dir: "$OUT/"
+output: "$orig_sample_id.h5mu"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/$orig_sample_id.h5mu"
+publish_dir: "$OUT/"
+output: "${orig_sample_id}_mms.h5mu"
+HERE
+
+# Run full pipeline
+nextflow \
+  run . \
+  -main-script src/workflows/multiomics/full_pipeline/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+# create fastqc directory
+fastqc_dir="$OUT/fastqc"
+mkdir -p "$fastqc_dir"
+
+./target/docker/qc/fastqc/fastqc \
+  --input "$raw_dir" \
+  --mode "dir" \
+  --output "$fastqc_dir"
--- a/resources_test_scripts/10x_5k_beam.sh
+++ b/resources_test_scripts/10x_5k_beam.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_beam
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/datasets/5k-human-a0201-b0702-pbmcs-beam-t-2-standard
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/5k_human_A0201_B0702_PBMCs_BEAM_T"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# Write the first 200000 records of the fastq file $1 to the gzip-compressed
+# file $2. Nothing is done when $2 already exists, so reruns skip finished
+# subsets.
+function seqkit_head {
+  local input="$1"
+  local output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename "$input")"
+    # Write to a temporary name and rename afterwards: if seqkit or gzip
+    # fails, no truncated "$output" is left behind to be mistaken for a
+    # finished subset by the existence check on the next run.
+    seqkit head -n 200000 "$input" | gzip > "$output.tmp"
+    mv "$output.tmp" "$output"
+  fi
+}
+
+orig_sample_id="beamt_human_A0201_B0702_pbmc"
+
+seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R1_001.fastq.gz" 
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R2_001.fastq.gz" 
+
+seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R1_001.fastq.gz" 
+seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R2_001.fastq.gz" 
+
+# download feature reference
+feature_ref="$raw_dir/beamt_human_A0201_B0702_pbmc_feature_reference.csv"
+if [[ ! -f "$feature_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_count_feature_reference.csv" -O "$feature_ref"
+fi
+
+# download vdj reference if needed
+vdj_ref="$raw_dir/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz"
+if [[ ! -f "$vdj_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -O "$vdj_ref"
+fi
+
+# Run mapping pipeline
+# TODO: Also include conversion to h5mu
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_gex_subset"
+    - "${orig_sample_id}_vdj_subset"
+    - "${orig_sample_id}_ag_subset"
+  library_type:
+    - "Gene Expression"
+    - "VDJ-T"
+    - "Antigen Capture"
+
+gex_reference: "$genome_tar"
+feature_reference: "$feature_ref"
+vdj_reference: "$vdj_ref"
+control_id:
+    - negative_control_A0201
+    - negative_control_B0702
+mhc_allele:
+    - "HLA-A*02:01"
+    - "HLA-B*07:02"
+publish_dir: "$OUT/processed"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config
+
+# Create h5mu
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/processed/$ID.cellranger_multi.output"
+publish_dir: "$OUT/"
+output: "$orig_sample_id.h5mu"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config
--- a/resources_test_scripts/10x_5k_lung_crispr.sh
+++ b/resources_test_scripts/10x_5k_lung_crispr.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_lung_crispr
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/5-k-a-549-lung-carcinoma-cells-no-treatment-transduced-with-a-crispr-pool-3-1-standard-6-0-0
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# Write the first 200000 records of the fastq file $1 to the gzip-compressed
+# file $2. Nothing is done when $2 already exists, so reruns skip finished
+# subsets.
+function seqkit_head {
+  local input="$1"
+  local output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename "$input")"
+    # Write to a temporary name and rename afterwards: if seqkit or gzip
+    # fails, no truncated "$output" is left behind to be mistaken for a
+    # finished subset by the existence check on the next run.
+    seqkit head -n 200000 "$input" | gzip > "$output.tmp"
+    mv "$output.tmp" "$output"
+  fi
+}
+
+orig_sample_id="SC3_v3_NextGem_DI_CRISPR_A549_5K"
+
+seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R2_001.fastq.gz"
+
+
+# download crispr feature reference
+crispr_ref="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv"
+if [[ ! -f "$crisp_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" -O "$crispr_ref"
+fi
+
+crispr_ref_adjusted="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv"
+reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
+cat "$crispr_ref" | while read line || [[ -n $line ]];
+do 
+  echo "Line: $line"
+  old_id=$( printf "%s\n" "$line" | awk -F',' '{print $7}' )
+  echo "Old ID: $old_id"
+  if [ "$old_id" = "Non-Targeting" ] || [ "$old_id" = "target_gene_id" ] ; then
+    echo "Just writing line"
+    printf "%s\n" "$line" >> "$crispr_ref_adjusted"
+  else
+    gtf_lookup=$(zgrep "$old_id" "$reference_gtf" || test $? = 1;)
+    if [ ! -z "$gtf_lookup" ]; then
+      echo "Found hit"
+      new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
+      echo "New ID: $new_id"
+      new_line=${line/"$old_id"/"$new_id"}
+      printf "%s\n" "$new_line" >> "$crispr_ref_adjusted"
+    else
+      echo "Did not find hit"
+    fi
+  fi
+done
+
+
+# Run mapping pipeline
+# TODO: Also include conversion to h5mu
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_gex_subset"
+    - "${orig_sample_id}_crispr_subset"
+  library_type:
+    - "Gene Expression"
+    - "CRISPR Guide Capture"
+
+gex_reference: "$genome_tar"
+feature_reference: "$crispr_ref_adjusted"
+publish_dir: "$OUT/processed"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+# Create h5mu
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/processed/10x_5k_lung_crispr.cellranger_multi.output"
+publish_dir: "$OUT/"
+output: "$orig_sample_id.h5mu"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
--- a/resources_test_scripts/annotation_test_data.sh
+++ b/resources_test_scripts/annotation_test_data.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=annotation_test_data
+OUT=resources_test/$ID/
+
+# ideally, this would be a versioned pipeline run
+[ -d "$OUT" ] || mkdir -p "$OUT"
+
+# Download Tabula Sapiens Blood reference h5ad from https://doi.org/10.5281/zenodo.7587774
+wget "https://zenodo.org/record/7587774/files/TS_Blood_filtered.h5ad?download=1" -O "${OUT}/tmp_TS_Blood_filtered.h5ad"
+
+# Download Tabula Sapiens Blood pretrained model from https://doi.org/10.5281/zenodo.7580707
+wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?download=1" -O "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
+
+# Download PopV specific CL ontology files - needed for OnClass
+# OUT_ONTOLOGY="${OUT}/ontology"
+# [ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.obo \
+# -O "${OUT_ONTOLOGY}/cl.obo"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology \
+# -O "${OUT_ONTOLOGY}/cl.ontology"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology.nlp.emb \
+# -O "${OUT_ONTOLOGY}/cl.ontology.nlp.emb"
+
+
+# Process Tabula Sapiens Blood reference h5ad
+# (Select one individual and 100 cells per cell type)
+python <<HEREDOC
+import anndata as ad
+ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
+sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"] 
+n=100
+s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
+sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
+# assert sub_ref_adata_final.shape == (500, 58870)
+sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
+HEREDOC
+
+
+echo "> Converting to h5mu"
+viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5ad" \
+    --output "${OUT}/TS_Blood_filtered.h5mu" \
+    --modality "rna"
+
+rm "${OUT}/tmp_TS_Blood_filtered.h5ad"
+
+echo "> Downloading pretrained CellTypist model and sample test data"
+wget https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl \
+    -O "${OUT}/celltypist_model_Immune_All_Low.pkl"
+wget https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad \
+    -O "${OUT}/demo_2000_cells.h5ad"
+viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/demo_2000_cells.h5ad" \
+    --output "${OUT}/demo_2000_cells.h5mu" \
+    --modality "rna"
+
+
+echo "> Fetching OnClass data and models"
+OUT_ONTOLOGY="${OUT}/ontology"
+[ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
+wget https://figshare.com/ndownloader/files/28394466 -O "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
+tar -xzvf "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" -C "${OUT_ONTOLOGY}" --strip-components=2
+rm "${OUT_ONTOLOGY}/allen.ontology"
+rm "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
+
+wget https://figshare.com/ndownloader/files/28394541 -O "${OUT}/OnClass_models.tar.gz"
+tar -xzvf "${OUT}/OnClass_models.tar.gz" -C "${OUT}" --strip-components=1
+rm "${OUT}/OnClass_models.tar.gz"
+rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
+
+find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} +
+mv "${OUT}/Pretrained_model" "${OUT}/onclass_model"
--- a/resources_test_scripts/aws_sync.sh
+++ b/resources_test_scripts/aws_sync.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -eo pipefail
+
+aws s3 sync --profile di "resources_test" "s3://openpipelines-data" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
+
+id=cellranger_tiny_fastq
+aws s3 sync --profile di "resources_test/$id" "s3://openpipelines-data/$id" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
--- a/resources_test_scripts/bdrhap_5kjrt.sh
+++ b/resources_test_scripts/bdrhap_5kjrt.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=bdrhap_5kjrt
+OUT=resources_test/$ID
+n_threads=30
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1"
+genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/12WTA-ABC-SMK-EB-5kJRT"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+    wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+genome_dir="$raw_dir/temp_reference_gencodev41_chr1"
+if [[ ! -d "$genome_dir" ]]; then
+  echo "> Untarring genome"
+  mkdir -p "$genome_dir"
+  tar -xvf "$genome_tar" -C "$genome_dir"
+fi
+
+# process WTA fastq files
+# map to chr1, subsample chr1 reads 
+mapping_dir="$raw_dir/temp_mapping_chr_1"
+if [[ ! -f "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" ]]; then
+  echo "> Processing 12WTA_S1_L432_R[12]_001.fastq.gz"
+  mkdir -p "$mapping_dir"
+  # MUST USE A STAR THAT IS COMPATIBLE WITH BD RHAPSODY
+  # For the cwl pipeline 1.9.1, 2.5.2b should work.
+  echo "star"
+  docker run --rm -i \
+    -v "`pwd`/$OUT:`pwd`/$OUT" \
+    -v "$tar_dir:$tar_dir" \
+    -w `pwd` bdgenomics/rhapsody:1.10.1 \
+    STAR \
+      --runThreadN "$n_threads" \
+      --genomeDir "$genome_dir" \
+      --readFilesIn "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" \
+      --runRNGseed 100 \
+      --outFileNamePrefix "$mapping_dir/" \
+      --readFilesCommand "gzip -d -k -c" \
+      --clip3pAdapterSeq "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
+      --outFilterMatchNmin "25" \
+      --quantTranscriptomeBan "Singleend" # Prohibit mapping of one side of the read
+  # chown to current user before removing mapping dir
+  docker run --rm -i -v "`pwd`/$OUT:`pwd`/$OUT" -w `pwd` bdgenomics/rhapsody:1.10.1 \
+    chown "$(id -u):$(id -g)" --silent --recursive "$mapping_dir/"
+
+  echo "samtools"
+  samtools view -F 260 "$mapping_dir/Aligned.out.sam" > "$mapping_dir/primary_aligned_reads.sam"
+  echo "cut"
+  cut -f 1 "$mapping_dir/primary_aligned_reads.sam" | sort | uniq > "$mapping_dir/mapped_reads.txt"
+  head -500000 "$mapping_dir/mapped_reads.txt" > "$mapping_dir/mapped_reads_subset.txt"
+  echo "seqkit"
+  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R1_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq"
+  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq"
+
+  # rm -r "$mapping_dir"
+  # rm -r "$genome_dir"
+fi
+
+# subsample other files
+smk_r1_file="$raw_dir/12SMK_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$smk_r1_file" ]]; then
+  echo "> Processing `basename $smk_r1_file`"
+  seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R1_001.fastq.gz" | gzip > "$smk_r1_file"
+fi
+smk_r2_file="$raw_dir/12SMK_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$smk_r2_file" ]]; then
+  echo "> Processing `basename $smk_r2_file`"
+  seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R2_001.fastq.gz" | gzip > "$smk_r2_file"
+fi
+abc_r1_file="$raw_dir/12ABC_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$abc_r1_file" ]]; then
+  echo "> Processing `basename $abc_r1_file`"
+  seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R1_001.fastq.gz" | gzip > "$abc_r1_file"
+fi
+abc_r2_file="$raw_dir/12ABC_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$abc_r2_file" ]]; then
+  echo "> Processing `basename $abc_r2_file`"
+  seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R2_001.fastq.gz" | gzip > "$abc_r2_file"
+fi
+wta_r1_file="$raw_dir/12WTA_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$wta_r1_file" ]]; then
+  echo "> Processing `basename $wta_r1_file`"
+  gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" > "$wta_r1_file"
+fi
+wta_r2_file="$raw_dir/12WTA_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$wta_r2_file" ]]; then
+  echo "> Processing `basename $wta_r2_file`"
+  gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" > "$wta_r2_file"
+fi
+# copy immune panel fasta
+fasta_file="$raw_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta"
+if [[ ! -f "$fasta_file" ]]; then
+  cp "$tar_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" "$fasta_file"
+fi
+
+genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
+
+nextflow run . \
+  -main-script target/nextflow/workflows/ingestion/bd_rhapsody/main.nf  \
+  -resume \
+  -profile docker,mount_temp \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/errorstrat_ignore.config \
+  --reads "$wta_r1_file;$wta_r2_file;$abc_r1_file;$abc_r2_file;$smk_r1_file;$smk_r2_file" \
+  --reference_archive "$genome_tar" \
+  --abseq_reference "$fasta_file" \
+  --sample_tags_version "hs" \
+  --tag_names "1-Jurkat;2-Ramos;3-THP1" \
+  --output_raw "output_raw" \
+  --output "output.h5mu" \
+  --output_state state.yaml \
+  --cell_calling_data "mRNA" \
+  --exact_cell_count 4000 \
+  --generate_bam true \
+  --publish_dir "$OUT/processed"
--- a/resources_test_scripts/bdrhap_vdj.sh
+++ b/resources_test_scripts/bdrhap_vdj.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# TODO: we should turn this into viash components
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=bdrhap_vdj
+OUT=resources_test/$ID
+n_threads=30
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/VDJDemo"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+    wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/VDJDemo/VDJDemo.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# subset fastq files
+for sample_id in RhapVDJDemo-BCR_S1_L001_R1_001 RhapVDJDemo-BCR_S1_L001_R2_001 RhapVDJDemo-mRNA_S5_L001_R1_001 RhapVDJDemo-mRNA_S5_L001_R2_001 RhapVDJDemo-TCR_S3_L001_R1_001 RhapVDJDemo-TCR_S3_L001_R2_001; do
+  subset_file="$raw_dir/${sample_id}_subset.fastq.gz"
+  if [[ ! -f "$subset_file" ]]; then
+  echo "> Processing $sample_id"
+    seqkit head -n 300000 "$tar_dir/$sample_id.fastq.gz" | gzip > "$subset_file"
+  fi
+  unset subset_file
+done
+
+# copy immune panel fasta
+fasta_file="$raw_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta"
+if [[ ! -f "$fasta_file" ]]; then
+  cp "$tar_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" "$fasta_file"
+fi
+
+# create params file
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "targeted_vdj"
+  input: "$raw_dir/RhapVDJDemo-*_S*_L001_R[12]_001_subset.fastq.gz"
+mode: targeted
+reference: "$fasta_file"
+publish_dir: "$OUT/processed"
+putative_cell_call: "mRNA"
+vdj_version: human
+HERE
+
+# run bd rhapsody pipeline
+nextflow \
+  run . \
+  -main-script src/workflows/ingestion/bd_rhapsody/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -with-trace work/trace.txt \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
--- a/resources_test_scripts/cellranger_atac_tiny_bcl.sh
+++ b/resources_test_scripts/cellranger_atac_tiny_bcl.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_atac_tiny_bcl
+OUT="resources_test/$ID/"
+DIR="$OUT"
+REFERENCE_DIR=resources_test/reference_gencodev41_chr1
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb
+
+
+# download bcl data
+if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
+  mkdir -p "$OUT/bcl"
+
+  # download tar gz
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-1.0.0.tar.gz \
+    --output "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
+  
+  # untar
+  
+  tar -xf "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" \
+    --strip-components=1 \
+    -C "$OUT/bcl"
+
+  # remove tar
+  rm "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
+
+  # Download the layout file. It contains info about the samples (1 in this case) and lanes
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-simple-1.0.0.csv \
+    --output "${OUT}/bcl/layout.csv"
+
+  # download sample sheet
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-samplesheet-1.0.0.csv \
+    --output "${OUT}/bcl/sample_sheet.csv"
+fi
+
+if [ ! -d "${OUT}/fastqs" ]; then
+  mkdir -p "$OUT/fastqs"
+
+  target/docker/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq \
+    --input "${OUT}/bcl" \
+    --csv "${OUT}/bcl/layout.csv" \
+    --output "${OUT}/fastqs"
+fi
+
+# Create count matrices
+if [ ! -d "${OUT}/counts" ]; then
+  mkdir -p "$OUT/counts"
+  
+  target/docker/mapping/cellranger_atac_count/cellranger_atac_count \
+    --input "${OUT}/fastqs/HJN3KBCX2/test_sample/" \
+    --reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \
+    --output "${OUT}/counts"
+fi
--- a/resources_test_scripts/cellranger_tiny_bcl.sh
+++ b/resources_test_scripts/cellranger_tiny_bcl.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_tiny_bcl
+OUT="resources_test/$ID/"
+DIR="$OUT"
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+# download bcl data
+if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
+  mkdir -p "$OUT/bcl"
+
+  # download tar gz
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz \
+    --output "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
+  
+  # untar
+  tar -xf "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" \
+    --strip-components=1 \
+    -C "$OUT/bcl"
+
+  # remove tar
+  rm "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
+
+  # download sample sheet
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv \
+    --output "${OUT}/bcl/sample_sheet.csv"
+fi
+
+if [ ! -d "${OUT}/fastqs" ]; then
+  mkdir -p "$OUT/fastqs"
+
+  target/docker/demux/cellranger_mkfastq/cellranger_mkfastq \
+    --input "${OUT}/bcl" \
+    --sample_sheet "${OUT}/bcl/sample_sheet.csv" \
+    --output "${OUT}/fastqs"
+fi
+
+# bcl-convert requires a v2 sample sheet
+# bcl-convert is a bit more strict concerning filter files being present or not.
+# We make a copy and make the necessary adaptations. 
+
+# We are using the tiny bcl dataset provided by Illumina:
+#   https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
+# Unfortunately,
+#   1. the sample sheet delivered with it does not work with bcl-convert (v1 of the format)
+#   2. 2 filter files are missing from the run directory that bcl-convert requires to run
+#
+# We worked around this by
+#   1. Manually editing a sample sheet file suited for bcl-convert (format v2)
+#   2. Adding a filter file
+#
+# The filter file is a binary file; we simply create an empty file and use that.
+# bcl-convert might complain about it, but at least something is written out.
+# An alternative is to use a filter file from a different project. This also generates
+# a warning, but the fastq output files contain reads. The drawback is that those filter files
+# are generally above 100MB in size.
+#
+# TODO: Check if a (binary) filter file can be generated that is small but works.
+
+if [ ! -f "${OUT}/bcl2/sample_sheet.csv" ]; then
+  mkdir -p "${OUT}/bcl2/"  # -p: bcl2/ may be left over from an interrupted run without a sample sheet
+  cp -r "${OUT}"/bcl/* "${OUT}/bcl2/"
+  cat > "${OUT}/bcl2/sample_sheet.csv" << HERE
+[Header],,,,,,,,,
+FileFormatVersion,2,,,,,,
+RunName,hiseq_test,,,,,,
+InstrumentPlatform,NextSeq,,,,,,
+IndexOrientation,Forward,,,,,,
+,,,,,,,,,
+[Reads],,,,,,,,,
+Read1Cycles,26,,,,,,,,,
+Read2Cycles,98,,,,,,
+,,,,,,,,,
+[Sequencing_Settings],,,,,,,
+,,,,,,,
+[BCLConvert_Settings],,,,,,,
+SoftwareVersion,3.8.4,,,,,,
+NoLaneSplitting,true,,,,,,
+FastqCompressionFormat,gzip,,,,,,
+,,,,,,,,,
+[BCLConvert_Data],,,,,,,
+Sample_ID,index,,,,,,
+s1,GGTTTACT,,,,,,
+,,,,,,,
+[Cloud_Settings],,,,,,,
+GeneratedVersion,1.3.0.202111171923,,,,,,
+,,,,,,,
+[Cloud_Data],,,,,,,
+Sample_ID,ProjectName,LibraryName,LibraryPrepKitName,IndexAdapterKitName,I7_Index_ID,Sample_Name,Description,Instrument,Type
+s1,p1,s1_SI-P03-C9,,,IDT01,SI-P03-C9,s1,NextSeq,HighOutput_75cycles
+HERE
+  
+  touch "${OUT}/bcl2/Data/Intensities/BaseCalls/L001/s_1_1101.filter"
+fi
--- a/resources_test_scripts/cellranger_tiny_fastq.sh
+++ b/resources_test_scripts/cellranger_tiny_fastq.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_tiny_fastq
+OUT="resources_test/$ID/"
+DIR="$OUT"
+
+# download cellranger tar gz
+cellranger_tar_gz="${OUT}/temp_cellranger-6.1.2.tar.gz"
+if [ ! -f "$cellranger_tar_gz" ]; then
+  echo "Download Cell Ranger 6.1.2 manually first!"
+  exit 1
+fi
+
+# untar fastqs
+cellranger_tiny_fastq="${OUT}/cellranger_tiny_fastq"
+if [ ! -f "${cellranger_tiny_fastq}/tinygex_S1_L001_R1_001.fastq.gz" ]; then
+  mkdir -p "$cellranger_tiny_fastq"
+  
+  tar -xzf "$cellranger_tar_gz" \
+    -C "$cellranger_tiny_fastq" \
+    "cellranger-6.1.2/external/cellranger_tiny_fastq" \
+    --strip-components=3
+fi
+
+# untar ref
+cellranger_tiny_ref="${OUT}/cellranger_tiny_ref"
+if [ ! -f "${cellranger_tiny_ref}/reference.json" ]; then
+  mkdir -p "$cellranger_tiny_ref"
+  
+  tar -xzf "$cellranger_tar_gz" \
+    -C "$cellranger_tiny_ref" \
+    "cellranger-6.1.2/external/cellranger_tiny_ref" \
+    --strip-components=3
+fi
+
+# Create ref with more recent STAR version
+recent_ref_dir="${OUT}/cellranger_tiny_ref_v2_7_10_a"
+if [ ! -f "${recent_ref_dir}/Genome" ]; then
+  mkdir -p "${recent_ref_dir}"
+
+  target/docker/mapping/star_build_reference/star_build_reference \
+    --genome_fasta "$cellranger_tiny_ref/fasta/genome.fa" \
+    --output "$recent_ref_dir" \
+    --genomeSAindexNbases 7 \
+    --transcriptome_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz"
+fi
+
+# run cellranger count
+bam_dir="${OUT}/bam"
+if [ ! -f "$bam_dir/possorted_genome_bam.bam" ]; then
+  mkdir -p "$bam_dir"
+
+  viash run src/mapping/cellranger_count/config.vsh.yaml -- \
+    --input "$cellranger_tiny_fastq" \
+    --reference "$cellranger_tiny_ref" \
+    --output "$bam_dir"
+fi
+
+# convert to h5mu
+raw_h5mu="${OUT}/raw_dataset.h5mu"
+if [ ! -f "$step1_h5mu" ]; then
+  viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+    --input "${bam_dir}/raw_feature_bc_matrix.h5" \
+    --output "$raw_h5mu"
+fi
+
+# run velocyto
+velo_gtf="$cellranger_tiny_ref/genes/genes.gtf.gz"
+velo_bam="$bam_dir/possorted_genome_bam.bam"
+velo_loom="${OUT}/velocyto.loom"
+if [ ! -f "$velo_loom" ]; then
+  viash run src/velocity/velocyto/config.vsh.yaml -- \
+    --input "$velo_bam" \
+    --output "$velo_loom" \
+    --transcriptome "$velo_gtf"
+fi
+
+# combine raw counts with velocyto data
+dataset_h5mu="${OUT}/dataset.h5mu"
+if [ ! -f "$dataset_h5mu" ]; then
+  viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
+    --input_loom "$velo_loom" \
+    --input_h5mu "$raw_h5mu" \
+    --output "$dataset_h5mu"
+fi
+
+# run htseq
+htseq_counts="${OUT}/htseq_counts.tsv"
+if [ ! -f "$htseq_counts" ]; then
+  viash run src/mapping/htseq_count/config.vsh.yaml -- \
+  --input "$velo_bam" \
+  --reference "$velo_gtf" \
+  --output "$htseq_counts"
+fi
+
+multi_star="${OUT}/multi_star"
+if [ ! -d "$multi_star" ]; then
+  viash run src/mapping/multi_star/config.vsh.yaml -- \
+    --input_id "tinygex" \
+    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz" \
+    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz" \
+    --input_id "tinygex" \
+    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz" \
+    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz" \
+    --reference_index "$recent_ref_dir" \
+    --reference_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz" \
+    --output "$multi_star" \
+    ---cpus 30
+fi
--- a/resources_test_scripts/concat_test_data.sh
+++ b/resources_test_scripts/concat_test_data.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# The output folder
+OUT="resources_test/concat_test_data/"
+
+# create it if it doesn't exist already
+[ -d "$OUT" ] || mkdir -p "$OUT"
+
+echo "> Downloading files"
+target/docker/download/download_file/download_file \
+  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/e18_mouse_brain_fresh_5k/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5 \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5"
+
+target/docker/download/download_file/download_file \
+  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/human_brain_3k/human_brain_3k_filtered_feature_bc_matrix.h5 \
+  --output "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5"
+
+echo "> Converting to h5mu"
+viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+  --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu"
+
+viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5" \
+  --output "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu"
+
+echo "> Subsetting datasets"
+viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
+  --output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+  --number_of_observations 2000
+
+viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+  --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+  --number_of_observations 2000
+
+echo "Making observation ids unique (required for concat component to function)"
+viash run src/metadata/add_id/config.vsh.yaml -- \
+--input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+--output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+--input_id "human" \
+--make_observation_keys_unique
+
+viash run src/metadata/add_id/config.vsh.yaml -- \
+--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+--input_id "mouse" \
+--make_observation_keys_unique
+
+echo "Removing temp files"
+rm "${OUT}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
+   "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
+   "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+   "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+   "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
+   "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5"
+   
+
+echo "> Running concat component"
+viash run src/dataflow/concat/config.vsh.yaml -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu,$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+  --input_id "human,mouse" \
+  --output "$OUT/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"
--- a/resources_test_scripts/demuxafy_test_data.sh
+++ b/resources_test_scripts/demuxafy_test_data.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the commands below are run from the root of the repository,
+# so that $OUT does not depend on the directory the script is started from
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=demuxafy_test_data
+OUT=resources_test/$ID
+DIR="$OUT"
+
+mkdir -p "$OUT"
+cd "$OUT"
+# download demuxafy test dataset
+wget https://www.dropbox.com/s/m8u61jn4i1mcktp/TestData4PipelineSmall.tar.gz
+tar -xf TestData4PipelineSmall.tar.gz
+# bam and vcf file
+cp TestData4PipelineSmall/test_dataset/outs/pooled.sorted.bam.bai .
+cp TestData4PipelineSmall/test_dataset/outs/pooled.sorted.bam .
+cp TestData4PipelineSmall/test_dataset.vcf .
+# extract chr from vcf file
+grep -w '^#\|^#CHROM\|^[1-2]' test_dataset.vcf > test_dataset_chr1_2.vcf
+grep -w '^#\|^#CHROM\|^[3-4]' test_dataset.vcf > test_dataset_chr3_4.vcf
+
+# barcode list
+cp TestData4PipelineSmall/test_dataset/outs/filtered_gene_bc_matrices/Homo_sapiens_GRCh38p10/barcodes.tsv .
+
+# subsetted bam and bai for souporcell
+wget https://www.dropbox.com/s/7ew5lt0msf4z5gj/chr_1_pooled.sorted.bam
+wget https://www.dropbox.com/s/tpplbj9sab9b2p4/chr_1_pooled.sorted.bam.bai
+
+# variants from mixed sample
+wget https://www.dropbox.com/s/btir7ge4kzc7tu1/mixed_variant.vcf
+
+# dsc_pileup output
+wget https://www.dropbox.com/s/17hj9i0yavtezx1/dsc_pileup.zip
+unzip dsc_pileup.zip
+
+# subsetted human genome reference
+wget https://www.dropbox.com/s/ynlce3g7nwxthwg/genome_chr1.fa
+
+# remove unnecessary files
+rm -rf TestData4PipelineSmall
+rm TestData4PipelineSmall.tar.gz
+rm dsc_pileup.zip
--- a/resources_test_scripts/hlca_reference_model.sh
+++ b/resources_test_scripts/hlca_reference_model.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=HLCA_reference_model
+OUT=resources_test/$ID/$ID
+DIR=$(dirname "$OUT")
+
+# ideally, this would be a versioned pipeline run
+[ -d "$DIR" ] || mkdir -p "$DIR"
+
+# download pre-trained scANVI model (zip archive; it is not extracted by this script)
+wget https://zenodo.org/record/6337966/files/HLCA_reference_model.zip \
+  -O "${OUT}.zip"
+
+# # Test query data
+# # Source publication: Delorey, Toni M., et al. “COVID-19 tissue atlases reveal SARS-CoV-2 pathology and cellular targets.” Nature 595.7865 (2021): 107-113.
+# wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5230nnn/GSM5230027/suppl/GSM5230027_04-P103142-S149-R01_raw_feature_bc_matrix.h5.gz \
+#   -O "${OUT}_query_test.h5.gz"
+# gzip -d "${OUT}_query_test.h5.gz"
+
+# # Prepare test data as in scvi-tools tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/query_hlca_knn.html
+# python <<HEREDOC
+# import pandas as pd
+# import scanpy as sc
+
+# geo_metadata_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171668/suppl/GSE171668_lung_metadata.csv.gz"
+# metadata = pd.read_csv(geo_metadata_url, index_col=0)
+
+# DATA_PATH = "${OUT}_query_test.h5"
+# query_data = sc.read_10x_h5(DATA_PATH)
+# # clean up .var.index (gene names)
+# query_data.var['gene_names'] = query_data.var.index
+# query_data.var.index = [idx.split("___")[-1] for idx in query_data.var.gene_ids]
+# # clean up cell barcodes:
+# query_data.obs.index = query_data.obs.index.str.rstrip("-1")
+# # read in metadata (to select only cells of interest and remove empty drops)
+# # subset to cells from our sample
+# metadata = metadata.loc[metadata.donor == "D12_4",:].copy()
+# # clean up barcodes:
+# metadata.index = [idx.split("-")[-1] for idx in metadata.index]
+# # subset adata to cells in metadata:
+# query_data = query_data[metadata.index,:].copy()
+# # add dataset information:
+# query_data.obs['dataset'] = "test_dataset_delorey_regev"
+# sc.write(DATA_PATH, query_data)
+# HEREDOC
+
+# # convert 10x h5 to h5mu
+# viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml -- \
+#   --input "${OUT}_query_test.h5" \
+#   --output "${OUT}_query_test.h5mu"
--- a/resources_test_scripts/merge_test_data.sh
+++ b/resources_test_scripts/merge_test_data.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the commands below are run from the root of the repository;
+# both the component executable and the input file are addressed relative to it
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=merge_test_data
+OUT=resources_test/$ID
+DIR="$OUT"
+
+mkdir -p "$OUT"
+
+# run the split_modalities component on the pbmc_1k_protein_v3 dataset, writing into $OUT
+target/docker/dataflow/split_modalities/split_modalities \
+  --input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu \
+  --output "$OUT"
+
--- a/resources_test_scripts/pbmc_1k_protein_v3.sh
+++ b/resources_test_scripts/pbmc_1k_protein_v3.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=pbmc_1k_protein_v3
+OUT=resources_test/$ID/$ID
+DIR=$(dirname "$OUT")
+
+# ideally, this would be a versioned pipeline run
+[ -d "$DIR" ] || mkdir -p "$DIR"
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-gene-expression-and-cell-surface-protein-3-standard-3-0-0
+
+# download metrics summary
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv \
+  -O "${OUT}_metrics_summary.csv"
+
+# download counts h5 file
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 \
+  -O "${OUT}_filtered_feature_bc_matrix.h5"
+
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \
+  -O "${OUT}_raw_feature_bc_matrix.h5"
+
+# download counts matrix tar gz file
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.tar.gz \
+  -O "${OUT}_filtered_feature_bc_matrix.tar.gz"
+
+# extract matrix tar gz
+mkdir -p "${OUT}_filtered_feature_bc_matrix"
+tar -xvf "${OUT}_filtered_feature_bc_matrix.tar.gz" \
+  -C "${OUT}_filtered_feature_bc_matrix" \
+  --strip-components 1
+rm "${OUT}_filtered_feature_bc_matrix.tar.gz"
+
+# convert 10x h5 to h5mu
+target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu \
+  --input "${OUT}_filtered_feature_bc_matrix.h5" \
+  --input_metrics_summary "${OUT}_metrics_summary.csv" \
+  --output "${OUT}_filtered_feature_bc_matrix.h5mu"
+
+# run single sample
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/rna/rna_singlesample/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_uss \
+  --input "${OUT}_filtered_feature_bc_matrix.h5mu" \
+  --output "`basename $OUT`_uss.h5mu" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+# add the sample ID to the mudata object
+nextflow \
+  run . \
+  -main-script target/nextflow/metadata/add_id/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_uss \
+  --input "${OUT}_uss.h5mu" \
+  --input_id "pbmc_1k_protein_v3_uss" \
+  --output "`basename $OUT`_uss_with_id.h5mu" \
+  --output_compression "gzip" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+# run multisample
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/rna/rna_multisample/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_ums \
+  --input "${OUT}_uss_with_id.h5mu" \
+  --output "`basename $OUT`_ums.h5mu" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+rm "${OUT}_uss_with_id.h5mu"
+
+# run dimred
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_mms \
+  --input "${OUT}_ums.h5mu" \
+  --output "`basename $OUT`_mms.h5mu" \
+  --publishDir `dirname $OUT` \
+  --obs_covariates sample_id \
+  -resume
+
+# run integration
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/integration/harmony_leiden/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_mms_integration \
+  --input "${OUT}_mms.h5mu" \
+  --output "`basename $OUT`_mms.h5mu" \
+  --publishDir `dirname $OUT` \
+  --obs_covariates sample_id \
+  -resume
+
+python <<HEREDOC
+import mudata as mu
+mudata = mu.read_h5mu("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu")
+mudata.mod["rna"].write_h5ad("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad")
+HEREDOC
--- a/resources_test_scripts/ref_gencodev41_chr1.sh
+++ b/resources_test_scripts/ref_gencodev41_chr1.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=reference_gencodev41_chr1
+OUT=resources_test/$ID
+
+mkdir -p "$OUT" 
+
+wget "https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" -O "$OUT/ERCC92.zip"
+
+# Download JASPAR files for reference building
+# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
+motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+motifs_in="${OUT}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+
+if [ ! -f "$motifs_in" ]; then
+    curl -sS "$motifs_url" > "$motifs_in"
+fi
+
+# Change motif headers so the human-readable motif name precedes the motif
+# identifier. So ">MA0004.1    Arnt" -> ">Arnt_MA0004.1".
+motifs_modified="${OUT}/$(basename "$motifs_in").modified"
+awk '{
+    if ( substr($1, 1, 1) == ">" ) {
+        print ">" $2 "_" substr($1,2)
+    } else {
+        print
+    }
+}' "$motifs_in" > "$motifs_modified"
+
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: "$ID"
+    genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
+    transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz"
+    target: ["bd_rhapsody", "cellranger_arc"] 
+    output_fasta: "reference.fa.gz"
+    output_gtf: "reference.gtf.gz"
+    non_nuclear_contigs: null
+    output_cellranger_arc: "reference_cellranger.tar.gz"
+    output_bd_rhapsody: "reference_bd_rhapsody.tar.gz"
+    bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2"
+    motifs_file: "$motifs_modified"
+    subset_regex: "chr1"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
+  -profile docker \
+  -c ./src/workflows/utils/labels_ci.config \
+  -params-file /tmp/params.yaml \
+  --publish_dir $OUT \
+  -resume
--- a/resources_test_scripts/remote_param_list.sh
+++ b/resources_test_scripts/remote_param_list.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+mkdir -p "resources_test/remote_param_list/"
+OUT=resources_test/remote_param_list/test_param_list.yaml
+OUT_CSV=resources_test/remote_param_list/test_param_list.csv
+OUT_JSON=resources_test/remote_param_list/test_param_list.json
+
+cat > $OUT << HERE
+- id: "mouse"
+  input: s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
+  publish_dir: "foo_remote/"
+  rna_min_counts: 2
+  prot_min_counts: 3
+- id: "human"
+  input: s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
+  publish_dir: "foo_remote/"
+  rna_min_counts: 2
+  prot_min_counts: 3
+HERE
+
+cat > $OUT_CSV << EOF
+"id","input","publish_dir","rna_min_counts","prot_min_counts"
+"mouse","s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
+"human","s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
+EOF
+
+cat > $OUT_JSON << HERE
+[
+    {
+        "id": "mouse",
+        "input": "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+        "publish_dir": "foo_remote/",
+        "rna_min_counts": 2,
+        "prot_min_counts": 3
+    },
+    {
+        "id": "human",
+        "input": "s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+        "publish_dir": "foo_remote/",
+        "rna_min_counts": 2,
+        "prot_min_counts": 3
+    }
+]
+HERE
--- a/resources_test_scripts/rna_velocity.sh
+++ b/resources_test_scripts/rna_velocity.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=rna_velocity
+OUT=resources_test/$ID
+
+
+# create raw directory
+velocyto_dir="$OUT/velocyto"
+mkdir -p "$velocyto_dir"
+
+########################################################
+# Create a compatible BAM file from BD Rhapsody Output #
+########################################################
+
+bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/WTA.bd_rhapsody.output_raw/sample_final.BAM"
+
+if [[ ! -f "$bd_rhap_wta_bam" ]]; then
+    echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
+    exit 1
+fi
+
+echo "> Converting BD Rhapsody barcode tags."
+viash run src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml -- \
+  -i "$bd_rhap_wta_bam" \
+  -o "$velocyto_dir/compatible_bd_input.bam" \
+  --bam \
+  -t 4
+
+echo "> Creating barcodes file."
+samtools view -@4 "$velocyto_dir/compatible_bd_input.bam" | \
+  grep -oP "(?<=CB:Z:)\S+" | sort | uniq | head > "$velocyto_dir/barcodes.txt"
+
+###########################################################
+# Process Tiny Fast Fastq dataset from 10X to create      #
+# input data for convert/from_velocyto_to_h5mu component  #
+###########################################################
+
+mkdir -p "$OUT/velocyto_processed"
+
+gtf="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz"
+bam="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"
+
+echo "> Processing 10x dataset"
+viash run src/velocity/velocyto/config.vsh.yaml -- \
+  -i "$bam" \
+  -o "$OUT/velocyto_processed/cellranger_tiny.loom" \
+  --transcriptome "$gtf"
--- a/resources_test_scripts/scgpt.sh
+++ b/resources_test_scripts/scgpt.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=scgpt
+OUT=resources_test/$ID
+
+# create foundational model directory
+foundation_model_dir="$OUT/source"
+mkdir -p "$foundation_model_dir"
+
+# install gdown if necessary
+# Check whether gdown is available
+if ! command -v gdown &> /dev/null; then
+    echo "This script requires gdown. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+echo "> Downloading scGPT foundation model (full_human)"
+# download foundational model files (full_human)
+# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y
+gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json"
+gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json"
+gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt"
+
+# create test data dir
+test_resources_dir="$OUT/test_resources"
+mkdir -p "$test_resources_dir"
+
+echo "> Downloading test resources"
+# download test data
+# https://drive.google.com/file/d/1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL/view?usp=drive_link
+gdown '1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL' -O "${test_resources_dir}/Kim2020_Lung.h5ad"
+
+echo "> Converting to h5mu"
+python <<HEREDOC
+import anndata as ad
+import mudata as mu
+input_adata = ad.read_h5ad("${test_resources_dir}/Kim2020_Lung.h5ad")
+input_mdata = mu.MuData({'rna': input_adata})
+input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu")
+HEREDOC
+
+echo "> Subsetting datasets"
+viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung.h5mu" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \
+  --number_of_observations 4000
+
+rm "${test_resources_dir}/Kim2020_Lung.h5ad"
+
+echo "> Preprocessing datasets"
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
+  -profile docker \
+  -c src/workflows/utils/labels_ci.config \
+  --input "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \
+  --output "Kim2020_Lung_subset_preprocessed.h5mu" \
+  --publish_dir "${test_resources_dir}"
+
+echo "> Filtering highly variable features"
+viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
+  --layer "log_normalized" \
+  --var_name_filter "filter_with_hvg" \
+  --n_top_features 1200 \
+  --flavor "seurat_v3"
+
+viash run src/filter/do_filter/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
+  --var_filter "filter_with_hvg"
+  
+echo "> Running scGPT cross check genes"
+viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
+  --vocab_file "${foundation_model_dir}/vocab.json"
+
+echo "> Running scGPT binning"
+viash run src/scgpt/binning/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
+  --input_layer "log_normalized" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu"
+
+echo "> Running scGPT tokenizing"
+viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
+  --input_layer "binned" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
+  --model_vocab "${foundation_model_dir}/vocab.json"
+
+echo "> Running scGPT integration"
+viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
+  --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \
+  --model "${foundation_model_dir}/best_model.pt" \
+  --model_vocab "${foundation_model_dir}/vocab.json" \
+  --model_config "${foundation_model_dir}/args.json" \
+  --obs_batch_label "sample"
+
+echo "> Removing unnecessary files in test resources dir"
+find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete
+
+echo "> scGPT test resources are ready!"
+
--- a/resources_test_scripts/vireo_test_data.sh
+++ b/resources_test_scripts/vireo_test_data.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Fetch the vireo tutorial VCF into resources_test/vireo_test_data.
+set -eo pipefail
+
+# OUT is a relative path, so it is resolved against the directory the script is started from.
+# settings
+ID=vireo_test_data
+OUT=resources_test/$ID
+DIR="$OUT"  # NOTE(review): DIR is not referenced again in this script
+
+mkdir -p "$OUT"
+cd "$OUT"
+# download vireo tutorial dataset (wget writes into the current directory, i.e. $OUT)
+wget https://github.com/single-cell-genetics/vireo/raw/master/data/cells.cellSNP.vcf.gz
--- a/schemas/author.yaml
+++ b/schemas/author.yaml
@@ -0,0 +1 @@
+$ref: "defs_common.yaml#/definitions/Author"
--- a/schemas/defs_common.yaml
+++ b/schemas/defs_common.yaml
@@ -0,0 +1,395 @@
+definitions:
+  Config:
+    description: "A Viash configuration is a YAML file which contains metadata to\
+      \ describe the behaviour and build target(s) of a component.  \nWe commonly\
+      \ name this file `config.vsh.yaml` in our examples, but you can name it however\
+      \ you choose.  \n"
+    type: "object"
+    properties:
+      label:
+        description: "A clean version of the component's name. This is only used for\
+          \ documentation."
+        type: "string"
+      license:
+        description: "The license of the package."
+        type: "string"
+      authors:
+        description: "A list of authors. An author must at least have a name, but\
+          \ can also have a list of roles, an e-mail address, and a map of custom\
+          \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\
+          \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\
+          \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\
+          \ |\n| author | aut | for persons who have made substantial contributions\
+          \ to the software. |\n| contributor | ctb| for persons who have made smaller\
+          \ contributions (such as code patches).\n| datacontributor | dtc | for persons\
+          \ or organisations that contributed data sets for the software\n| copyrightholder\
+          \ | cph | for all copyright holders. This is a legal concept so should use\
+          \ the legal name of an institution or corporate body.\n| funder | fnd |\
+          \ for persons or organizations that furnished financial support for the\
+          \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\
+          \ is extremely comprehensive.\n"
+        type: "array"
+        items:
+          type: object
+          properties:
+            __merge__:
+              type: string
+              pattern: "^/src/authors/.*\\.yaml$"
+            roles:
+              description: |
+                  Role of the author. Possible values:
+
+                  * `"author"`: Authors who have made substantial contributions to the component.
+                  * `"maintainer"`: The maintainer of the component.
+                  * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.).
+              type: array
+              items:
+                enum: [maintainer, author, contributor]
+      status:
+        description: "Allows setting a component to active, deprecated or disabled."
+        $ref: "defs_viash.yaml#/definitions/Status"
+      requirements:
+        description: "Computational requirements related to running the component.\
+          \ \n`cpus` specifies the maximum number of (logical) cpus a component is\
+          \ allowed to use, whereas\n`memory` specifies the maximum amount of memory\
+          \ a component is allowed to allocate. Memory units must be\nin B, KB, MB,\
+          \ GB, TB or PB for SI units (1000-base), or KiB, MiB, GiB, TiB or PiB for\
+          \ binary IEC units (1024-base)."
+        $ref: "defs_viash.yaml#/definitions/ComputationalRequirements"
+      repositories:
+        description: "(Pre-)defines repositories that can be used as repository in\
+          \ dependencies.\nAllows reusing repository definitions in case it is used\
+          \ in multiple dependencies."
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/RepositoryWithName"
+      dependencies:
+        description: "Allows listing Viash components required by this Viash component"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Dependency"
+      summary:
+        description: "A one-sentence summary of the component. This is only used for\
+          \ documentation."
+        type: "string"
+      runners:
+        description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\
+          \ - NextflowRunner\n"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Runner"
+      name:
+        description: "Name of the component and the filename of the executable when\
+          \ built with `viash build`."
+        type: "string"
+      argument_groups:
+        description: "A grouping of the arguments, used to display the help message.\n\
+          \n - `name: foo`, the name of the argument group. \n - `description: Description\
+          \ of foo`, a description of the argument group. Multiline descriptions are\
+          \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\
+          \n"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/ArgumentGroup"
+      description:
+        description: "A description of the component. This is only used for documentation.\
+          \ Multiline descriptions are supported."
+        type: "string"
+      usage:
+        description: "A description on how to use the component. This will be displayed\
+          \ with `--help` under the 'Usage:' section."
+        type: "string"
+      info:
+        description: "Structured information. Can be any shape: a string, vector,\
+          \ map or even nested map."
+        type: "object"
+      version:
+        description: "Version of the component. This field will be used to version\
+          \ the executable and the Docker container."
+        type: "string"
+      links:
+        description: "External links of the component."
+        $ref: "defs_viash.yaml#/definitions/Links"
+      references:
+        description: "References to external resources related to the component."
+        $ref: "defs_viash.yaml#/definitions/References"
+      engines:
+        description: "A list of engine environments to execute target artifacts in.\n\
+          \n - NativeEngine\n - DockerEngine\n"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Engine"
+      resources:
+        description: "Resources are files that support the component. The first resource\
+          \ should be a script that will be executed when the component is run. Additional\
+          \ resources will be copied to the same directory.\n\nCommon properties:\n\
+          \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\
+          \ / `scala_script` / `csharp_script`, specifies the type of the resource.\
+          \ The first resource cannot be of type `file`. When the type is not specified,\
+          \ the default type is simply `file`.\n * dest: filename, the resulting name\
+          \ of the resource.  From within a script, the file can be accessed at `meta[\"\
+          resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\
+          \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\
+          \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\
+          \ exclusive with `text`.\n * text: ...multiline text..., the content of\
+          \ the resulting file specified as a string. Mutually exclusive with `path`.\n\
+          \ * is_executable: `true` / `false`, whether the resulting resource file\
+          \ should be made executable.\n"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Resource"
+      keywords:
+        description: "The keywords of the components."
+        type: "array"
+        items:
+          type: "string"
+      test_resources:
+        description: "One or more scripts to be used to test the component behaviour\
+          \ when `viash test` is invoked. Additional files of type `file` will be\
+          \ made available only during testing. Each test script should expect no\
+          \ command-line inputs, be platform-independent, and return an exit code\
+          \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\
+          \ more info."
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Resource"
+      namespace:
+        description: "Namespace this component is a part of. See the Namespaces guide\
+          \ for more information on namespaces."
+        type: "string"
+      arguments:
+        description: "A list of arguments for this component. For each argument, a\
+          \ type and a name must be specified. Depending on the type of argument,\
+          \ different properties can be set. See these reference pages per type for\
+          \ more information:  \n\n - string\n - file\n - integer\n - double\n - boolean\n\
+          \ - boolean_true\n - boolean_false\n"
+        type: "array"
+        items:
+          $ref: "defs_viash.yaml#/definitions/Argument"
+      __merge__:
+        $ref: "#/definitions/Merge"
+    required:
+    - "name"
+    additionalProperties: false
+  Merge:
+    description: Path(s) to one or more YAML files to inherit values from.
+    oneOf:  # no top-level `type: string` here: it would reject the array form
+      - type: array
+        items: { type: string }
+      - type: string
+  FileFormat:
+    description: 'File format metadata'
+    type: object
+    required: [label, file_format]
+    properties:
+      label:
+        type: string  # inlined: defs_common.yaml has no `Label` definition to $ref
+      summary:
+        type: string  # inlined: defs_common.yaml has no `Summary` definition to $ref
+      file_format:
+        oneOf:
+          - type: object
+            required: [type]
+            additionalProperties: false
+            properties:
+              type:
+                const: h5ad
+              X:
+                $ref: "#/definitions/AnnDataSlot"
+              layers:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              var:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              varm:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              varp:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              obs:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              obsm:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              obsp:
+                type: array
+                items:
+                  $ref: "#/definitions/AnnDataSlot"
+              uns:
+                type: array
+                items:
+                  oneOf:
+                    - $ref: "#/definitions/AnnDataSlot"
+                    - $ref: "#/definitions/AnnDataSlotObject"
+          - type: object
+            required: [type, mod]
+            additionalProperties: false
+            properties:
+              type:
+                const: h5mu
+              mod:
+                type: object
+                additionalProperties: false
+                properties:
+                  rna:
+                    $ref: "#/definitions/AnnData"
+                  atac:
+                    $ref: "#/definitions/AnnData"
+                  prot:
+                    $ref: "#/definitions/AnnData"
+                  vdj:
+                    $ref: "#/definitions/AnnData"
+                  vdj_t:
+                    $ref: "#/definitions/AnnData"
+                  vdj_b:
+                    $ref: "#/definitions/AnnData"
+                  gdo:
+                    $ref: "#/definitions/AnnData"
+                  hto:
+                    $ref: "#/definitions/AnnData"
+  MuData:
+    required: [mod]
+    additionalProperties: false
+    properties:
+      mod:  # one optional entry per supported modality, each validated as AnnData
+        type: object
+        additionalProperties: false
+        properties:
+          rna:
+            $ref: "#/definitions/AnnData"
+          atac:
+            $ref: "#/definitions/AnnData"
+          prot:
+            $ref: "#/definitions/AnnData"
+          vdj:
+            $ref: "#/definitions/AnnData"
+          vdj_t:
+            $ref: "#/definitions/AnnData"
+          vdj_b:
+            $ref: "#/definitions/AnnData"
+          gdo:
+            $ref: "#/definitions/AnnData"
+          hto:
+            $ref: "#/definitions/AnnData"
+  AnnData:
+    additionalProperties: false
+    properties:
+      X:
+        $ref: "#/definitions/AnnDataSlot"
+      layers:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      var:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      varm:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      varp:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      obs:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      obsm:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      obsp:
+        type: array
+        items:
+          $ref: "#/definitions/AnnDataSlot"
+      uns:
+        type: array
+        items:
+          oneOf:
+            - $ref: "#/definitions/AnnDataSlot"
+            - $ref: "#/definitions/AnnDataSlotObject"
+  AnnDataSlot:
+    properties:
+      type:
+        enum: [integer, double, string, boolean]
+      name:
+        type: string
+        description: A unique identifier.
+        pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$"
+      description:
+        type: string
+      required:
+        type: boolean
+    required: [type, name, description, required]
+  AnnDataSlotObject:
+    properties:
+      type:
+        enum: [object]
+      name:
+        type: string
+        description: A unique identifier.
+        pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$"
+      description:
+        type: string
+      required:
+        type: boolean
+    required: [type, name, description, required]
+
+  # added specific properties to the author info
+  Author:
+    description: Author metadata.
+    type: object
+    additionalProperties: false
+    properties:
+      name:
+        description: Full name of the author, usually in the name of FirstName MiddleName LastName.
+        type: string
+      info:
+        description: Additional information on the author
+        type: object
+        additionalProperties: false
+        required: [role, links, organizations]
+        properties:
+          links:
+            type: object
+            additionalProperties: false
+            required: [github]  # `required` must be an array of property names
+            properties:
+              github:
+                type: string
+              orcid:
+                type: string
+              email:
+                type: string
+              twitter:
+                type: string
+              linkedin:
+                type: string
+          role:
+            description: Role in the organisation
+            enum: ["Core Team Member", "Contributor"]
+          organizations:
+            type: array
+            minItems: 1
+            items:
+              type: object
+              additionalProperties: false
+              required: [name, href, role]
+              properties:
+                name:
+                  type: string
+                href:
+                  type: string
+                role:
+                  type: string
--- a/schemas/defs_viash.yaml
+++ b/schemas/defs_viash.yaml
--- a/schemas/file_format.yaml
+++ b/schemas/file_format.yaml
@@ -0,0 +1,56 @@
+
+title: File API
+description: A file format specification file.
+type: "object"
+properties:
+  info:
+    $ref: "defs_common.yaml#/definitions/FileFormat"
+  create_parent:
+    description: "If the output filename is a path and it does not exist, create\
+      \ it before executing the script (only for `direction: output`)."
+    type: "boolean"
+  default:
+    anyOf:
+    - description: "The default value when no argument value is provided. This\
+        \ will not work if the [`required`](#required) property is enabled."
+      type: "string"
+    - description: "The default value when no argument value is provided. This\
+        \ will not work if the [`required`](#required) property is enabled."
+      type: "array"
+      items:
+        type: "string"
+  example:
+    anyOf:
+    - description: "An example value for this argument. If no [`default`](#default)\
+        \ property was specified, this will be used for that purpose."
+      type: "string"
+    - description: "An example value for this argument. If no [`default`](#default)\
+        \ property was specified, this will be used for that purpose."
+      type: "array"
+      items:
+        type: "string"
+  description:
+    description: "A description of the argument. This will be displayed with `--help`."
+    type: "string"
+  multiple_sep:
+    description: "The delimiter character for providing [`multiple`](#multiple)\
+      \ values. `:` by default."
+    type: "string"
+  multiple:
+    description: "Treat the argument value as an array. Arrays can be passed using\
+      \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\
+      \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\
+      \ property. `false` by default."
+    type: "boolean"
+  type:
+    description: "A `file` type argument has a string value that points to a file\
+      \ or folder path."
+    const: "file"
+  required:
+    description: "Make the value for this argument required. If set to `true`,\
+      \ an error will be produced if no value was provided. `false` by default."
+    type: "boolean"
+  __merge__:
+    $ref: "defs_common.yaml#/definitions/Merge"
+required:  [type, info]
+additionalProperties: false
--- a/schemas/package_config.yaml
+++ b/schemas/package_config.yaml
@@ -0,0 +1 @@
+$ref: "defs_viash.yaml#/definitions/PackageConfig"
--- a/schemas/viash_config.yaml
+++ b/schemas/viash_config.yaml
@@ -0,0 +1,2 @@
+oneOf:
+  - $ref: "defs_common.yaml#/definitions/Config"
--- a/src/annotate/celltypist/config.vsh.yaml
+++ b/src/annotate/celltypist/config.vsh.yaml
@@ -0,0 +1,150 @@
+name: celltypist
+namespace: annotate
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+      - name: "--var_query_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+        default: "cell_ontology_class"
+      - name: "--check_expression"
+        type: boolean_true
+        description: | 
+          Whether to check that the expression of the reference dataset is in the format recommended by CellTypist.
+          CellTypist requires data to be log-normalized to 10000 counts per cell.
+      - name: "--var_reference_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: false
+      - name: "--majority_voting"
+        type: boolean
+        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
+        default: false
+      - name: "--C"
+        type: double
+        description: "Inverse of regularization strength in logistic regression."
+        default: 1.0
+      - name: "--max_iter"
+        type: integer
+        description: "Maximum number of iterations before reaching the minimum of the cost function."
+        default: 1000
+      - name: "--use_SGD"
+        type: boolean_true
+        description: "Whether to use the stochastic gradient descent algorithm."
+      - name: "--min_prop"
+        type: double
+        description: |
+          For the dominant cell type within a subcluster, the minimum proportion of cells required to
+          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
+          A subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
+        default: 0
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_obs_predictions"
+        type: string
+        default: celltypist_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: celltypist_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+    
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: python:3.10-slim
+    setup:
+      - type: apt
+        packages:
+          - libhdf5-dev
+          - procps
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
+      - type: python
+        packages:
+          - celltypist==1.6.3
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/annotate/celltypist/script.py
+++ b/src/annotate/celltypist/script.py
@@ -0,0 +1,115 @@
+import sys
+import logging
+import celltypist
+import mudata as mu
+import re
+import numpy as np
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_log_normalized.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,
+    "reference_obs_target": "cell_ontology_class",
+    "check_expression": False,
+    "feature_selection": True,
+    "majority_voting": True,
+    "output_compression": "gzip",
+    "var_query_gene_names": None,
+    "var_reference_gene_names": "ensemblid",
+    "input_layer": None,
+    "reference_layer": None,
+    "output_obs_predictions": "celltypist_pred",
+    "output_obs_probability": "celltypist_probability",  # key matches the --output_obs_probability argument
+    "C": 1.0, "max_iter": 1000, "use_SGD": False, "min_prop": 0,  # config arguments (defaults shown) that were missing here
+}
+meta = {}
+## VIASH END
+
+# START TEMPORARY WORKAROUND setup_logger
+def setup_logger():
+    """Attach an INFO-level stdout handler to the root logger and return that logger."""
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.INFO)
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)-8s %(message)s"))
+    root_logger.addHandler(stdout_handler)
+
+    return root_logger
+# END TEMPORARY WORKAROUND setup_logger
+
+def check_celltypist_format(indata):
+    """Return True when expm1 of the first row of `indata` sums to 10000, within a tolerance of 1."""
+    first_row_total = np.expm1(indata[0]).sum()
+    return not np.abs(first_row_total - 10000) > 1
+
+def set_var_index(adata, var_name):  # re-index var by column `var_name`, minus any trailing ".<digits>"
+    adata.var.index = list(map(lambda name: re.sub("\\.[0-9]+$", "", name), adata.var[var_name]))
+    return adata
+
+def main(par):
+    """Annotate the query modality with CellTypist (pre-trained model, or one trained on the reference) and write par["output"]."""
+    if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]):
+        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")
+
+    logger = setup_logger()
+
+    input_mudata = mu.read_h5mu(par["input"])
+    input_modality = input_mudata.mod[par["modality"]].copy()
+
+    # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
+    # CellTypist requires query gene names to be in the same format as the reference data.
+    input_modality = set_var_index(input_modality, par["var_query_gene_names"]) if par["var_query_gene_names"] else input_modality
+
+    if par["model"]:
+        logger.info("Loading CellTypist model")
+        model = celltypist.models.Model.load(par["model"])
+    elif par["reference"]:
+        reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]
+
+        if par["var_reference_gene_names"]:
+            reference_modality = set_var_index(reference_modality, par["var_reference_gene_names"])
+
+        logger.info("Detecting common vars")
+        common_ens_ids = reference_modality.var.index.intersection(input_modality.var.index)
+
+        logger.info("  reference n_vars: %i", reference_modality.n_vars)
+        logger.info("  input n_vars: %i", input_modality.n_vars)
+        logger.info("  intersect n_vars: %i", len(common_ens_ids))
+        assert len(common_ens_ids) >= 100, "The intersection of genes is too small."
+
+        input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X
+        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X
+
+        if not check_celltypist_format(input_matrix):
+            logger.warning("Input data is not in the reccommended format for CellTypist.")
+        if not check_celltypist_format(reference_matrix):
+            logger.warning("Reference data is not in the reccommended format for CellTypist.")
+
+        labels = reference_modality.obs[par["reference_obs_target"]]
+        logger.info("Training CellTypist model on reference")
+        model = celltypist.train(reference_matrix,
+                                 labels=labels,
+                                 genes=reference_modality.var.index,
+                                 C=par["C"],
+                                 max_iter=par["max_iter"],
+                                 use_SGD=par["use_SGD"],
+                                 feature_selection=par["feature_selection"],
+                                 check_expression=par["check_expression"])
+
+    logger.info("Predicting CellTypist annotations")
+    predictions = celltypist.annotate(input_modality,
+                                      model,
+                                      majority_voting=par["majority_voting"],
+                                      min_prop=par["min_prop"])  # forward --min_prop, which was previously never used
+    input_modality.obs[par["output_obs_predictions"]] = predictions.predicted_labels["predicted_labels"]
+    input_modality.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(axis=1).values
+
+    input_mudata.mod[par["modality"]] = input_modality
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+    
+if __name__ == '__main__':
+    main(par)
--- a/src/annotate/celltypist/test.py
+++ b/src/annotate/celltypist/test.py
@@ -0,0 +1,148 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+import scanpy as sc
+import anndata as ad
+from openpipelinetestutils.asserters import assert_annotation_objects_equal
+## VIASH START
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+# Test resources, all located under the resources directory given by `meta`.
+# `input_file` and `celltypist_input_file` are passed as `--input`,
+# `reference_file` as `--reference` and `model_file` as `--model` by the tests below.
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+model_file = f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
+celltypist_input_file = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
+
+@pytest.fixture
+def normalize_log_transform(random_h5mu_path):
+    """Provide a helper that writes a normalized, log1p-transformed copy of an h5mu file.
+
+    The returned callable takes the path of an h5mu file, the modality to
+    transform and an optional `target_sum`, and returns the path of the newly
+    written file.
+    """
+    def wrapper(input_mudata_file, modality, target_sum=1e4):
+        mdata = mu.read_h5mu(input_mudata_file)
+        modality_copy = mdata.mod[modality].copy()
+
+        # Run scanpy on a bare AnnData that only holds a copy of .X
+        scratch = ad.AnnData(X=modality_copy.X.copy())
+        sc.pp.normalize_total(scratch, target_sum=target_sum)
+        sc.pp.log1p(scratch,
+                    base=None,
+                    layer=None, # use X
+                    copy=False) # allow overwrites in the copy that was made
+
+        # Carry the transformed matrix and the log1p metadata back over
+        modality_copy.X = scratch.X
+        modality_copy.uns['log1p'] = scratch.uns['log1p'].copy()
+        mdata.mod[modality] = modality_copy
+
+        transformed_path = random_h5mu_path()
+        mdata.write_h5mu(transformed_path)
+        return transformed_path
+    return wrapper
+
+def test_simple_execution(run_component, random_h5mu_path, normalize_log_transform):
+    """Train on the reference with default settings and annotate a log-normalized input."""
+    output_file = random_h5mu_path()
+    input_file_transformed = normalize_log_transform(input_file, "rna")
+
+    run_component([
+        "--input", input_file_transformed,
+        "--reference", reference_file,
+        # Singular, matching the `reference_obs_target` parameter the component reads.
+        "--reference_obs_target", "cell_ontology_class",
+        "--var_reference_gene_names", "ensemblid",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file_transformed)
+    output_mudata = mu.read_h5mu(output_file)
+
+    # The protein modality is not processed and must come through unchanged.
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"
+
+    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
+    
+def test_set_params(run_component, random_h5mu_path, normalize_log_transform):
+    """Run the component with non-default training and output parameters."""
+    output_file = random_h5mu_path()
+    input_file_transformed = normalize_log_transform(input_file, "rna")
+
+    component_args = [
+        "--input", input_file_transformed,
+        "--reference", reference_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--var_reference_gene_names", "ensemblid",
+        "--feature_selection", "True",
+        "--majority_voting", "True",
+        "--C", "0.5",
+        "--max_iter", "100",
+        "--use_SGD",
+        "--min_prop", "0.1",
+        "--input_layer", "log_normalized",
+        "--output", output_file,
+        "--output_compression", "gzip",
+    ]
+    run_component(component_args)
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file_transformed)
+    output_mudata = mu.read_h5mu(output_file)
+
+    # The protein modality must be passed through untouched
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    rna_obs = output_mudata.mod["rna"].obs
+    expected_columns = {'celltypist_pred', 'celltypist_probability'}
+    assert expected_columns.issubset(rna_obs.keys()), "Required keys not found in .obs"
+
+    probabilities = rna_obs["celltypist_probability"]
+    assert all(0 <= value <= 1 for value in probabilities), ".obs at celltypist_probability has values outside the range [0, 1]"
+
+def test_with_model(run_component, random_h5mu_path):
+    """Annotate the CellTypist demo data with a pre-trained model instead of a reference."""
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", celltypist_input_file,
+        "--model", model_file,
+        # Singular, matching the `reference_obs_target` parameter the component reads.
+        "--reference_obs_target", "cell_type",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"
+
+    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
+
+def test_fail_check_reference_expression(run_component, random_h5mu_path):
+    """The component must fail when `--check_expression` is set and the expression check does not pass."""
+    output_file = random_h5mu_path()
+    component_args = [
+        "--input", input_file,
+        "--reference", reference_file,
+        "--var_reference_gene_names", "ensemblid",
+        "--output", output_file,
+        "--check_expression"
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(component_args)
+
+    captured_stdout = err.value.stdout.decode('utf-8')
+    expected_message = r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell"
+    assert re.search(expected_message, captured_stdout)
+    
+def test_fail_invalid_input_expression(run_component, random_h5mu_path):
+    """The component must fail when the untransformed input file is annotated directly."""
+    output_file = random_h5mu_path()
+    component_args = [
+        "--input", input_file,
+        "--reference", reference_file,
+        "--var_reference_gene_names", "ensemblid",
+        "--output", output_file
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(component_args)
+
+    captured_stdout = err.value.stdout.decode('utf-8')
+    expected_message = r"Invalid expression matrix in `.X`, expect log1p normalized expression to 10000 counts per cell"
+    assert re.search(expected_message, captured_stdout)
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
+        
--- a/src/annotate/onclass/config.vsh.yaml
+++ b/src/annotate/onclass/config.vsh.yaml
@@ -0,0 +1,136 @@
+name: onclass
+namespace: annotate
+description: |
+  OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. 
+  These similarities enable OnClass to annotate cell types that are never seen in the training data.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used.
+        required: false
+      - name: "--cl_nlp_emb_file"
+        type: file
+        description: The .nlp.emb file with the cell type embeddings.
+        required: true
+      - name: "--cl_ontology_file"
+        type: file
+        description: The .ontology file with the cell type ontology.
+        required: true
+      - name: "--cl_obo_file"
+        type: file
+        description: The .obo file with the cell type ontology.
+        required: true
+      - name: "--var_query_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the OnClass model on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+        example: "cell_ontology_class"
+        required: true
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_obs_predictions"
+        type: string
+        default: onclass_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: onclass_prob 
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+  - name: Model arguments
+    description: Model arguments
+    arguments:
+      - name: "--model"
+        type: string
+        description: | 
+          "Pretrained model path without a file extension. If not provided, the model will be trained 
+          on the reference data and --reference should be provided. The path namespace should contain:
+            - a .npz or .pkl file
+            - a .data file
+            - a .meta file
+            - a .index file
+          e.g. /path/to/model/pretrained_model_target1 as saved by OnClass."
+        required: false
+        direction: input
+      - name: "--max_iter"
+        type: integer
+        default: 30
+        required: false
+        description: Maximum number of iterations for training the model.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: python:3.8
+    setup:
+      - type: python
+        packages:
+          - scikit-learn==0.24.0
+          - OnClass==1.2
+          - tensorflow==2.13.1
+          - obonet==1.1.0
+          - mudata
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/annotate/onclass/script.py
+++ b/src/annotate/onclass/script.py
@@ -0,0 +1,196 @@
+import sys
+import logging
+import mudata as mu
+import anndata as ad
+import re
+import numpy as np
+from OnClass.OnClassModel import OnClassModel
+import obonet
+from typing import Dict, Tuple
+from tqdm import tqdm
+
+
+## VIASH START
+# Keys mirror the argument names declared in config.vsh.yaml.
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "var_query_gene_names": None,
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,
+    "reference_obs_target": "cell_ontology_class",
+    "input_layer": None,
+    "reference_layer": None,
+    "max_iter": 100,
+    # Same values as the defaults declared in config.vsh.yaml.
+    "output_obs_predictions": "onclass_pred",
+    "output_obs_probability": "onclass_prob",
+    "cl_nlp_emb_file": "resources_test/annotation_test_data/ontology/cl.ontology.nlp.emb",
+    "cl_ontology_file": "resources_test/annotation_test_data/ontology/cl.ontology",
+    "cl_obo_file": "resources_test/annotation_test_data/ontology/cl.obo",
+    "output_compression": "gzip"
+}
+meta = {"resources_dir": "src/annotate/onclass"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+def setup_logger():
+    """Set the root logger to INFO, attach a stdout handler to it and return it."""
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.INFO)
+
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(
+        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    )
+    root_logger.addHandler(stdout_handler)
+
+    return root_logger
+# END TEMPORARY WORKAROUND setup_logger
+
+logger = setup_logger()
+
+def map_celltype_to_ontology_id(cl_obo_file: str) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """
+    Build lookup tables between cell ontology IDs and cell type names.
+
+    Nodes of the ontology graph without a "name" attribute are left out of
+    both tables. If several IDs carry the same name, the reverse table keeps
+    the ID that comes last in the graph's node order.
+
+    Parameters
+    ----------
+    cl_obo_file : str
+        Path to the cell ontology (.obo) file.
+
+    Returns
+    -------
+    Tuple[Dict[str, str], Dict[str, str]]
+        First the mapping from cell ontology ID to cell type name, then the
+        mapping from cell type name to cell ontology ID.
+    """
+    ontology_graph = obonet.read_obo(cl_obo_file)
+
+    cl_id_to_name = {}
+    for node_id, attributes in ontology_graph.nodes(data=True):
+        node_name = attributes.get("name")
+        if node_name is not None:
+            cl_id_to_name[node_id] = node_name
+
+    name_to_cl_id = {node_name: node_id for node_id, node_name in cl_id_to_name.items()}
+    return cl_id_to_name, name_to_cl_id
+
+def predict_input_data(model: OnClassModel,
+                       input_matrix: np.array,
+                       input_modality: ad.AnnData,
+                       id_to_name: dict,
+                       obs_prediction: str,
+                       obs_probability: str) -> ad.AnnData:
+    """
+    Run the OnClass prediction on the input data and store the results in `.obs`.
+
+    Parameters
+    ----------
+    model : OnClassModel
+        The OnClass model used for the prediction.
+    input_matrix : np.array
+        The expression matrix of the input data.
+    input_modality : ad.AnnData
+        The input AnnData object; its `var_names` are passed on as the genes
+        of `input_matrix`.
+    id_to_name : dict
+        Mapping from cell ontology IDs to cell type names.
+    obs_prediction : str
+        The `.obs` key under which the predicted cell type names are stored.
+    obs_probability : str
+        The `.obs` key under which the prediction probability is stored.
+
+    Returns
+    -------
+    ad.AnnData
+        The same `input_modality` object, with both `.obs` columns added.
+    """
+    query_features = model.ProcessTestFeature(
+        test_feature=input_matrix,
+        test_genes=input_modality.var_names,
+        log_transform=False,
+    )
+    prediction = model.Predict(query_features, use_normalize=False, refine=True, unseen_ratio=-1.0)
+
+    # NOTE(review): element 1 is presumably a per-cell score matrix and element 2
+    # the index of the predicted cell type per cell - confirm against OnClass.
+    scores = prediction[1]
+    ontology_ids = [model.i2co[index] for index in prediction[2]]
+    cell_type_names = [id_to_name[ontology_id] for ontology_id in ontology_ids]
+
+    input_modality.obs[obs_prediction] = cell_type_names
+    # Highest score per row, divided by the sum of that row
+    input_modality.obs[obs_probability] = np.max(scores, axis=1) / scores.sum(1)
+    return input_modality
+
+def set_var_index(adata, var_name):
+    """Replace the var index by the values of column `var_name`, stripped of a trailing ".<digits>" suffix."""
+    stripped_names = []
+    for gene_name in adata.var[var_name]:
+        stripped_names.append(re.sub("\\.[0-9]+$", "", gene_name))
+    adata.var.index = stripped_names
+    return adata
+
+def _to_dense(matrix):
+    """Return `matrix` as a dense array, converting only when it offers `toarray`."""
+    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)
+
+
+def main():
+    """
+    Annotate the input modality with OnClass and write the result to `par["output"]`.
+
+    Either a pre-trained model (`par["model"]`) is loaded, or a model is trained
+    on the reference dataset (`par["reference"]`); exactly one of both must be set.
+
+    Raises
+    ------
+    ValueError
+        If both or neither of 'model' and 'reference' are provided, if fewer
+        than 100 genes are shared between reference and input, or if the
+        reference contains labels that are not present in the cell ontology.
+    """
+    if bool(par["model"]) == bool(par["reference"]):
+        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")
+
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_modality = input_mudata.mod[par["modality"]].copy()
+
+    # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
+    if par["var_query_gene_names"]:
+        input_modality = set_var_index(input_modality, par["var_query_gene_names"])
+    # The matrix may be stored sparse or dense; only densify when needed.
+    input_matrix = _to_dense(input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X)
+
+    id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"])
+
+    model = OnClassModel(cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
+                         cell_type_network_file=par["cl_ontology_file"])
+
+    if par["model"]:
+        logger.info("Predicting cell types using pre-trained model")
+        model.BuildModel(use_pretrain=par["model"], ngene=None)
+    else:
+        logger.info("Reading reference data")
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+
+        # NOTE(review): the reference gene ids are always taken from the
+        # "ensemblid" var column; there is no argument to select another column.
+        reference_modality.var["gene_symbol"] = list(reference_modality.var.index)
+        reference_modality.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_modality.var["ensemblid"]]
+
+        logger.info("Detecting common vars based on ensembl ids")
+        common_ens_ids = set(reference_modality.var.index).intersection(input_modality.var.index)
+
+        logger.info("  reference n_vars: %i", reference_modality.n_vars)
+        logger.info("  input n_vars: %i", input_modality.n_vars)
+        logger.info("  intersect n_vars: %i", len(common_ens_ids))
+        # Raise explicitly: an `assert` would be stripped when running with -O.
+        if len(common_ens_ids) < 100:
+            raise ValueError("The intersection of genes is too small.")
+
+        reference_matrix = _to_dense(reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X)
+
+        logger.info("Training a model from reference...")
+        labels = reference_modality.obs[par["reference_obs_target"]].tolist()
+        # Report labels missing from the ontology up front instead of a bare KeyError.
+        unknown_labels = sorted({str(label) for label in labels if label not in name_to_id})
+        if unknown_labels:
+            raise ValueError(
+                "Reference labels not found in the cell ontology: "
+                + ", ".join(unknown_labels)
+            )
+        labels_cl = [name_to_id[label] for label in labels]
+        _ = model.EmbedCellTypes(labels_cl)
+        (
+            corr_train_feature,
+            _,
+            corr_train_genes,
+            _,
+        ) = model.ProcessTrainFeature(
+            train_feature=reference_matrix,
+            train_label=labels_cl,
+            train_genes=reference_modality.var_names,
+            test_feature=input_matrix,
+            test_genes=input_modality.var_names,
+            log_transform=False,
+        )
+        model.BuildModel(ngene=len(corr_train_genes))
+        model.Train(corr_train_feature,
+                    labels_cl,
+                    max_iter=par["max_iter"])
+
+    logger.info("Predicting cell types")
+    input_modality = predict_input_data(model,
+                                        input_matrix,
+                                        input_modality,
+                                        id_to_name,
+                                        par["output_obs_predictions"],
+                                        par["output_obs_probability"])
+    logger.info("Writing output data")
+    input_mudata.mod[par["modality"]] = input_modality
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+    
+if __name__ == "__main__":
+    main()
--- a/src/annotate/onclass/test.py
+++ b/src/annotate/onclass/test.py
@@ -0,0 +1,146 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+import anndata as ad
+from openpipelinetestutils.asserters import assert_annotation_objects_equal
+import os
+## VIASH START
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+# Test resources, all located under the resources directory given by `meta`.
+# The three `cl_*` files are passed to the component's `--cl_nlp_emb_file`,
+# `--cl_ontology_file` and `--cl_obo_file` arguments; `model_file` is passed as `--model`.
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+cl_nlp_emb_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
+cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology"
+cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo"
+model_file = f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
+
+
+@pytest.fixture
+def swap_gene_symbol(random_h5mu_path):
+    """Provide a helper that writes a copy of an h5mu file with gene symbols as var index.
+
+    The original var index is kept in the "ensemblid" column; the new index is
+    the "gene_symbol" column with a trailing ".<digits>" suffix removed.
+    """
+    def wrapper(input_mudata_file, modality):
+        mdata = mu.read_h5mu(input_mudata_file)
+        modality_copy = mdata.mod[modality].copy()
+
+        modality_copy.var["ensemblid"] = list(modality_copy.var.index)
+        symbols = [re.sub("\\.[0-9]+$", "", s) for s in modality_copy.var["gene_symbol"]]
+        modality_copy.var.index = symbols
+        mdata.mod[modality] = modality_copy
+
+        swapped_path = random_h5mu_path()
+        mdata.write_h5mu(swapped_path)
+        return swapped_path
+    return wrapper
+
+
+def test_simple_execution(run_component, random_h5mu_path):
+    """Train on the reference and annotate the input using the default output obs keys."""
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", input_file,
+        "--reference", reference_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--cl_nlp_emb_file", cl_nlp_emb_file,
+        "--cl_ontology_file", cl_ontology_file,
+        "--cl_obo_file", cl_obo_file,
+        "--max_iter", "10",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    # The protein modality is not processed and must come through unchanged.
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred',
+                                                         'onclass_prob']
+
+    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
+    # The message names the column that is actually checked.
+    assert all(0 <= value <= 1 for value in obs_values), ".obs at onclass_prob has values outside the range [0, 1]"
+
+
+def test_custom_obs(run_component, random_h5mu_path):
+    """Predictions and probabilities must be stored under user-provided obs keys."""
+    output_file = random_h5mu_path()
+
+    component_args = [
+        "--input", input_file,
+        "--reference", reference_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--output_obs_predictions", "dummy_pred_1",
+        "--output_obs_probability", "dummy_prob_1",
+        "--cl_nlp_emb_file", cl_nlp_emb_file,
+        "--cl_ontology_file", cl_ontology_file,
+        "--cl_obo_file", cl_obo_file,
+        "--max_iter", "10",
+        "--output", output_file
+    ]
+    run_component(component_args)
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    # The protein modality must be passed through untouched
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    rna_obs = output_mudata.mod["rna"].obs
+    assert set(rna_obs.keys()) == {'dummy_pred_1', 'dummy_prob_1'}
+
+    key = 'dummy_prob_1'
+    probabilities = rna_obs[key]
+    assert all(0 <= value <= 1 for value in probabilities), f".obs at {key} has values outside the range [0, 1]"
+        
+        
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    """The component must fail with a ValueError when neither --model nor --reference is given."""
+    output_file = random_h5mu_path()
+    component_args = [
+        "--input", input_file,
+        "--output", output_file,
+        "--cl_nlp_emb_file", cl_nlp_emb_file,
+        "--cl_ontology_file", cl_ontology_file,
+        "--cl_obo_file", cl_obo_file,
+        "--reference_obs_target", "cell_ontology_class"
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(component_args)
+
+    captured_stdout = err.value.stdout.decode('utf-8')
+    expected_message = r"ValueError: Make sure to provide either 'model' or 'reference', but not both."
+    assert re.search(expected_message, captured_stdout)
+    
+    
+def test_pretrained_model(run_component, random_h5mu_path, swap_gene_symbol):
+    """Annotate an input indexed by gene symbol with a pre-trained OnClass model."""
+    output_file = random_h5mu_path()
+    swapped_input_file = swap_gene_symbol(input_file, "rna")
+
+    run_component([
+        "--input", swapped_input_file,
+        "--cl_nlp_emb_file", cl_nlp_emb_file,
+        "--cl_ontology_file", cl_ontology_file,
+        "--cl_obo_file", cl_obo_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--model", model_file,
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    # The fixture only alters the "rna" modality, so "prot" can be compared
+    # against the original input file.
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred',
+                                                         'onclass_prob']
+
+    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
+    # The message names the column that is actually checked.
+    assert all(0 <= value <= 1 for value in obs_values), ".obs at onclass_prob has values outside the range [0, 1]"
+
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/popv/config.vsh.yaml
+++ b/src/annotate/popv/config.vsh.yaml
@@ -0,0 +1,165 @@
+name: popv
+namespace: "annotate"
+description: "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV."
+authors:
+  - __merge__: /src/authors/matthias_beyens.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/robrecht_cannoodt.yaml
+    roles: [ author ]
+argument_groups:
+  - name: Inputs
+    description: Arguments related to the input (aka query) dataset.
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: Input h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.
+        required: false
+      - name: "--input_obs_batch"
+        type: string
+        description: Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.
+        required: false
+      - name: "--input_var_subset"
+        type: string
+        description: Subset the input object with this column.
+        required: false
+      - name: "--input_obs_label"
+        type: string
+        description: Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `"unknown_celltype_label"`.
+        required: false
+      - name: "--unknown_celltype_label"
+        type: string
+        description: If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.
+        default: "unknown"
+        required: false
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "User-provided reference tissue. The data that will be used as reference to call cell types."
+        example: TS_Bladder_filtered.h5ad
+        direction: input
+        required: true
+      - name: "--reference_layer"
+        type: string
+        description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.
+        required: false
+      - name: "--reference_obs_label"
+        type: string
+        description: Key in obs field of reference AnnData with cell-type information.
+        default: "cell_ontology_class"
+        required: false
+      - name: "--reference_obs_batch"
+        type: string
+        description: Key in obs field of reference adata for batch information.
+        default: "donor_assay"
+        required: false
+      # - name: "--reference_models"
+      #   type: file
+      #   description: Pretrained models. Can be a directory or a tar gz.
+      #   required: false
+      #   example: pretrained_models_Bladder_ts.tar.gz
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        required: true
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      # - name: "--output_models"
+      #   type: file
+      #   direction: output
+      #   description: If `prediction_mode == "retrain"`, saves models to a directory and compresses the results into a tar gz.
+      #   example: "output.tar.gz"
+      #   required: false
+  - name: Arguments
+    description: Other arguments.
+    arguments:
+      - name: "--methods"
+        type: string
+        description: "Methods to call cell types, e.g. knn_on_scvi and scanvi."
+        example: ["knn_on_scvi", "scanvi"]
+        choices: [celltypist, knn_on_bbknn, knn_on_scanorama, knn_on_scvi, onclass, rf, scanvi, svm]
+        required: true
+        multiple: true
+      # - name: "--prediction_mode"
+      #   type: string
+      #   description: |
+      #     Execution mode of cell-type annotation.
+      #     "retrain": Train all prediction models and saves them to disk. Argument `output_models` must be defined.
+      #     "inference": Classify all cells based on pretrained models. Argument `reference_models` must be defined.
+      #     "fast": Fast inference using only query cells and single epoch in scArches.
+      # - name: "--plots"
+      #   type: boolean
+      #   description: "Creation of agreement and frequency plots between selected cell type algorithmn(s) and final PopV ensemble called cell type."
+      #   default: false
+      #   required: false
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    #image: nvcr.io/nvidia/pytorch:22.12-py3
+    image: python:3.9-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+          - git
+          - build-essential
+          - wget
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
+        packages:
+          - scvi-tools~=1.0.3
+          - popv~=0.3.2
+          - jax==0.4.10
+          - jaxlib==0.4.10
+          - ml-dtypes<0.3.0
+          - scipy==1.12.0
+      # These need to be updated AFTER popv is installed.
+      # See https://github.com/YosefLab/PopV/issues/30
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+      # download ontology required by popv
+      - type: docker
+        run: |
+          cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \
+            cd PopV && git fetch --depth 1 origin tag v0.2 && git checkout v0.2
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      # TODO: should add new label highmem-single-gpu and lowmem-single-gpu
+      label: [highmem, highcpu]
--- a/src/annotate/popv/script.py
+++ b/src/annotate/popv/script.py
@@ -0,0 +1,223 @@
+import sys
+import re
+import tempfile
+import typing
+import numpy as np
+import mudata as mu
+import anndata as ad
+import popv
+
+# todo: is this still needed?
+from torch.cuda import is_available as cuda_is_available
+try:
+    from torch.backends.mps import is_available as mps_is_available
+except ModuleNotFoundError:
+    # Older pytorch versions
+    # MacOS GPUs
+    def mps_is_available():
+        return False
+
+# where to find the obo files
+cl_obo_folder = "/opt/PopV/ontology/"
+
+## VIASH START
+# Debugging defaults; viash replaces everything between the VIASH START/END
+# markers with the real argument values when the component is built.
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    # "input": "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/tmp_TS_Blood_filtered.h5ad",
+    "input_obs_batch": None,
+    "input_layer": None,
+    "input_obs_label": None,
+    "input_var_subset": None,
+    "unknown_celltype_label": "unknown",
+    "reference_layer": None,
+    "reference_obs_label": "cell_ontology_class",
+    "reference_obs_batch": "donor_assay",
+    "output": "output.h5mu",
+    "output_compression": "gzip",
+    "methods": [
+        # "celltypist",
+        # "knn_on_bbknn",
+        # "knn_on_scanorama",
+        # "knn_on_scvi",
+        "rf",
+        # "scanvi",
+        "svm",
+    ]
+}
+# The script reads meta["resources_dir"] and meta["temp_dir"]; an empty dict
+# made a direct (non-viash) run fail with KeyError before doing any work.
+# temp_dir=None lets tempfile fall back to the system default directory.
+meta = {
+    "resources_dir": "src/utils",
+    "temp_dir": None,
+}
+# for debugging the obo folder can be somewhere local
+cl_obo_folder = "popv_cl_ontology/"
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    """Configure the root logger to print INFO messages to stdout and return it.
+
+    Every call adds one more stream handler to the root logger.
+    """
+    import logging
+
+    root = logging.getLogger()
+    root.setLevel(logging.INFO)
+
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(
+        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    )
+    root.addHandler(handler)
+
+    return root
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+# a GPU counts as available when either the CUDA or the MPS backend reports one
+use_gpu = cuda_is_available() or mps_is_available()
+logger.info("GPU enabled? %s", use_gpu)
+
+# Helper functions
+def get_X(adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]):
+    """Return the data matrix of `adata`, optionally restricted to some variables.
+
+    If `var_index` is truthy, `adata` is first subset on the var axis with
+    `adata[:, var_index]`. The matrix is then read from `.layers[layer]` when
+    `layer` is truthy, and from `.X` otherwise.
+    """
+    selected = adata[:, var_index] if var_index else adata
+    return selected.layers[layer] if layer else selected.X
+def get_obs(adata: ad.AnnData, obs_par_names):
+    """Return the `.obs` columns whose names are stored under the given parameter keys.
+
+    Each entry of `obs_par_names` is looked up in the module-level `par` dict;
+    keys whose value is falsy (for example None) are left out.
+    """
+    wanted_columns = []
+    for par_name in obs_par_names:
+        column = par[par_name]
+        if column:
+            wanted_columns.append(column)
+    return adata.obs[wanted_columns]
+def get_var(adata: ad.AnnData, var_index: list[str]):
+    """Return the rows of the `.var` dataframe selected by the labels in `var_index`."""
+    var_frame = adata.var
+    return var_frame.loc[var_index]
+
+def main(par, meta):
+    """Annotate the query modality with PopV and write the result as an h5mu file.
+
+    The reference (h5ad) and the query (h5mu) are restricted to the Ensembl ids
+    they share, PopV preprocessing and the requested annotation methods are run,
+    and the PopV results are stored on the query modality: all `popv_*` obs
+    columns go to `.obsm["popv_output"]` (with the prefix stripped) and
+    `popv_prediction` is additionally copied to `.obs`.
+
+    Args:
+        par: component arguments (see the VIASH START block for the keys read here).
+        meta: viash metadata; only `temp_dir` is read in this function.
+
+    Raises:
+        AssertionError: when no method is given, when `--input_var_subset` is
+            not a `.var` column, or when fewer than 100 genes are shared
+            between reference and query.
+    """
+    assert len(par["methods"]) >= 1, "Please, specify at least one method for cell typing."
+    logger.info("Cell typing methods: %s", par["methods"])
+
+    ### PREPROCESSING REFERENCE ###
+    logger.info("### PREPROCESSING REFERENCE ###")
+
+    # take a look at reference data
+    logger.info("Reading reference data '%s'", par["reference"])
+    reference = ad.read_h5ad(par["reference"])
+
+    # The gene intersection further down is computed on the var index, so the
+    # reference index is replaced by the Ensembl ids from .var["ensemblid"] with
+    # a trailing ".<digits>" version suffix removed. The previous index is kept
+    # in .var["gene_symbol"].
+    logger.info("Setting reference var index to Ensembl IDs")
+    reference.var["gene_symbol"] = list(reference.var.index)
+    reference.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]]
+
+    # NOTE(review): the log message and variable name refer to labels / cell
+    # types, but the group sizes are computed per `reference_obs_batch`.
+    # Confirm whether `reference_obs_label` was intended here.
+    logger.info("Detect number of samples per label")
+    min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
+    n_samples_per_label = np.max((min_celltype_size, 100))
+
+    ### PREPROCESSING INPUT ###
+    logger.info("### PREPROCESSING INPUT ###")
+    logger.info("Reading '%s'", par["input"])
+    # named input_mudata (not `input`) so the builtin is not shadowed
+    input_mudata = mu.read_h5mu(par["input"])
+    input_modality = input_mudata.mod[par["modality"]]
+
+    # subset with var column
+    if par["input_var_subset"]:
+        logger.info("Subset input with .var['%s']", par["input_var_subset"])
+        assert par["input_var_subset"] in input_modality.var, f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
+        input_modality = input_modality[:,input_modality.var[par["input_var_subset"]]]
+
+    ### ALIGN REFERENCE AND INPUT ###
+    logger.info("### ALIGN REFERENCE AND INPUT ###")
+
+    logger.info("Detecting common vars based on ensembl ids")
+    common_ens_ids = list(set(reference.var.index).intersection(set(input_modality.var.index)))
+
+    logger.info("  reference n_vars: %i", reference.n_vars)
+    logger.info("  input n_vars: %i", input_modality.n_vars)
+    logger.info("  intersect n_vars: %i", len(common_ens_ids))
+    assert len(common_ens_ids) >= 100, "The intersection of genes is too small."
+
+    # subset input objects to make sure popv is using the data we expect
+    input_modality = ad.AnnData(
+        X = get_X(input_modality, par["input_layer"], common_ens_ids),
+        obs = get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
+        var = get_var(input_modality, common_ens_ids)
+    )
+    reference = ad.AnnData(
+        X = get_X(reference, par["reference_layer"], common_ens_ids),
+        obs = get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
+        var = get_var(reference, common_ens_ids)
+    )
+
+    # The rebuilt AnnData objects carry only X, obs and var: layers and any
+    # other slots of the originals are deliberately not passed on to PopV.
+
+    ### RUN POPV ###
+    # (the log text below repeats the previous section header; it is kept
+    # unchanged so existing log output stays the same)
+    logger.info("### ALIGN REFERENCE AND INPUT ###")
+
+    # trained models are written to a throw-away directory that is removed
+    # as soon as the annotation has finished
+    with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir:
+        logger.info("Run PopV processing")
+        pq = popv.preprocessing.Process_Query(
+            # input
+            query_adata=input_modality,
+            query_labels_key=par["input_obs_label"],
+            query_batch_key=par["input_obs_batch"],
+            query_layers_key=None, # this is taken care of by subset
+            # reference
+            ref_adata=reference,
+            ref_labels_key=par["reference_obs_label"],
+            ref_batch_key=par["reference_obs_batch"],
+            # options
+            unknown_celltype_label=par["unknown_celltype_label"],
+            n_samples_per_label=n_samples_per_label,
+            # pretrained model
+            # Might need to be parameterized at some point
+            prediction_mode="retrain",
+            pretrained_scvi_path=None,
+            # outputs
+            # Might need to be parameterized at some point
+            save_path_trained_models=temp_dir,
+            # hardcoded values
+            cl_obo_folder=cl_obo_folder,
+            use_gpu=use_gpu
+        )
+        method_kwargs = {}
+        # NOTE(review): this is a list-membership test, and the method names
+        # listed in this script are of the form "knn_on_scanorama", so the
+        # branch presumably never fires. Confirm the intended key before
+        # changing it.
+        if 'scanorama' in par['methods']:
+            method_kwargs['scanorama'] = {'approx': False}
+        logger.info("Annotate data")
+        popv.annotation.annotate_data(
+            adata=pq.adata,
+            methods=par["methods"],
+            methods_kwargs=method_kwargs
+        )
+
+    # pq.adata is indexed with the query obs_names to get back the query cells
+    popv_input = pq.adata[input_modality.obs_names]
+
+    # select columns starting with "popv_"
+    popv_obs_cols = popv_input.obs.columns[popv_input.obs.columns.str.startswith("popv_")]
+
+    # create new data frame with selected columns
+    df_popv = popv_input.obs[popv_obs_cols]
+
+    # remove only the leading "popv_"; an un-anchored replace would also drop
+    # any later occurrence of "popv_" inside a column name
+    df_popv.columns = df_popv.columns.str.replace("^popv_", "", regex=True)
+
+    # store output in mudata .obsm
+    input_mudata.mod[par["modality"]].obsm["popv_output"] = df_popv
+
+    # copy important output in mudata .obs
+    for col in ["popv_prediction"]:
+        if col in popv_input.obs.columns:
+            input_mudata.mod[par["modality"]].obs[col] = popv_input.obs[col]
+
+    # code to explore how the output differs from the original
+    # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]:
+    #     old_keys = set(getattr(pq_adata_orig, attr).keys())
+    #     new_keys = set(getattr(pq.adata, attr).keys())
+    #     diff_keys = list(new_keys.difference(old_keys))
+    #     diff_keys.sort()
+    #     print(f"{attr}:", flush=True)
+    #     for key in diff_keys:
+    #         print(f"  {key}", flush=True)
+
+    # write output
+    logger.info("Writing %s", par["output"])
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+if __name__ == "__main__":
+    main(par, meta)
+
--- a/src/annotate/popv/test.py
+++ b/src/annotate/popv/test.py
@@ -0,0 +1,73 @@
+import sys
+import os
+import pytest
+import mudata as mu
+
+## VIASH START
+# Debugging default; viash replaces this block when the test is run through viash.
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+# Query (h5mu) and reference (h5ad) files shared by all tests in this module.
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
+
+def test_simple_execution(run_component):
+    """Run the component with two methods and check that `popv_prediction` is written."""
+    output_file = "output.h5mu"
+
+    # pass the same path that is checked below instead of repeating the literal
+    run_component([
+        "--input", input_file,
+        "--reference", reference_file,
+        "--output", output_file,
+        "--methods", "rf;svm"
+    ])
+
+    # check whether file exists
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    # read output mudata
+    output = mu.read_h5mu(output_file)
+
+    # check output
+    expected_rna_obs_cols = ["popv_prediction"]
+    for col in expected_rna_obs_cols:
+        assert col in output.mod["rna"].obs.columns, f"could not find columns .mod['rna'].obs['{col}']"
+
+    print(f"output: {output}", flush=True)
+
+def test_popv_with_other_layer(run_component, tmp_path):
+    """Smoke test: the component runs when the rna modality carries an extra layer.
+
+    The layer is a copy of `.X` and is not selected through any argument; the
+    test only requires that `run_component` does not raise.
+    """
+    modified_input = tmp_path / "input.h5mu"
+
+    mdata = mu.read(input_file)
+    rna = mdata.mod['rna']
+    rna.layers['test'] = rna.X.copy()
+    mdata.write_h5mu(modified_input)
+
+    cli_args = [
+        "--input", modified_input,
+        "--reference", reference_file,
+        "--output", "output.h5mu",
+        "--methods", "rf;svm;knn_on_scanorama",
+    ]
+    run_component(cli_args)
+
+def test_popv_with_non_overlapping_cells(run_component, tmp_path):
+    """Smoke test: the component runs when the modalities share no observation names."""
+    source = mu.read(input_file)
+
+    # work on copies and give every modality its own obs_names prefix so that
+    # no cell name occurs in both modalities
+    modalities = {}
+    for mod_name in ("rna", "prot"):
+        adata = source.mod[mod_name].copy()
+        adata.obs_names = [f"{mod_name}_{x}" for x in adata.obs_names]
+        modalities[mod_name] = adata
+
+    # write new h5mu to file
+    disjoint_input = tmp_path / "input.h5mu"
+    mu.MuData(modalities).write_h5mu(disjoint_input)
+
+    # run component
+    cli_args = [
+        "--input", disjoint_input,
+        "--reference", reference_file,
+        "--output", "output.h5mu",
+        "--methods", "rf;svm;knn_on_scanorama",
+    ]
+    run_component(cli_args)
+
+# Run this module's tests with pytest when executed directly and use the
+# pytest result as the process exit code.
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/random_forest_annotation/config.vsh.yaml
+++ b/src/annotate/random_forest_annotation/config.vsh.yaml
@@ -0,0 +1,140 @@
+name: random_forest_annotation
+namespace: annotate
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the random forest classifier on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: Key in obs field of reference modality with cell-type information.
+        required: true
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_obs_predictions"
+        type: string
+        default: random_forest_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: random_forest_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--n_estimators"
+        type: integer
+        required: false
+        default: 100
+        description: Number of trees in the random forest.
+      - name: "--max_depth"
+        type: integer
+        required: false
+        description: |
+          Maximum depth of the trees in the random forest. 
+          If not provided, the nodes are expanded until all leaves only contain a single sample.
+      - name: "--criterion"
+        type: string
+        required: false
+        choices: ["gini", "entropy", "log_loss"]
+        default: "gini"
+        description: The function to measure the quality of a split.
+      - name: "--class_weight"
+        type: string
+        required: false
+        default: "balanced_subsample"
+        choices: ["balanced", "balanced_subsample", "uniform"]
+        description: |
+          Weights associated with classes.
+          The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.
+          The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.
+          The `uniform` mode gives all classes a weight of one.
+      - name: "--max_features"
+        type: string
+        default: "200"
+        description: |
+          The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.
+          If integer: consider max_features features at each split.
+          If `sqrt`: max_features is the square root of the number of input features.
+          If `log2`: max_features is the log2 of the number of input features.
+          If `all`: max_features equals the number of input features.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - libhdf5-dev
+          - procps
+      - type: python
+        packages:
+          - scikit-learn==1.4.2
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/annotate/random_forest_annotation/script.py
+++ b/src/annotate/random_forest_annotation/script.py
@@ -0,0 +1,103 @@
+import sys
+import logging
+import mudata as mu
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+import pickle
+
+
+## VIASH START
+# Debugging defaults; viash replaces everything between the VIASH START/END
+# markers with the real argument values when the component is built.
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,
+    "reference_obs_target": "cell_ontology_class",
+    "input_layer": None,
+    "reference_layer": None,
+    "n_estimators": 100,
+    "criterion": "gini",
+    "max_depth": None,
+    "class_weight": None,
+    # --max_features is declared as a string argument, so mirror that type here
+    "max_features": "200",
+    "output_compression": "gzip",
+    "output_obs_predictions": "random_forest_pred",
+    "output_obs_probability": "random_forest_probability"
+}
+# NOTE(review): this path points at the svm component, which looks like a
+# copy/paste leftover. Confirm the intended directory.
+meta = {"resources_dir": "src/annotate/svm"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+def setup_logger():
+    """Configure the root logger to print INFO messages to stdout and return it.
+
+    Every call adds one more stream handler to the root logger.
+    """
+    root = logging.getLogger()
+    root.setLevel(logging.INFO)
+
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(
+        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    )
+    root.addHandler(handler)
+
+    return root
+# END TEMPORARY WORKAROUND setup_logger
+
+logger = setup_logger()
+
+def _parse_max_features(value):
+    """Translate the --max_features argument into the value scikit-learn expects.
+
+    `all` maps to None, `sqrt` and `log2` are passed through, and anything
+    else must be convertible with `int()`.
+
+    Raises:
+        ValueError: if `value` is neither a known keyword nor an integer.
+    """
+    keywords = {
+        "all": None,
+        "sqrt": "sqrt",
+        "log2": "log2",
+    }
+    # Look the keyword up first. The previous `dict.get(value, int(value))`
+    # evaluated int() before the lookup, so every keyword raised ValueError.
+    if value in keywords:
+        return keywords[value]
+    try:
+        return int(value)
+    except ValueError as err:
+        # the (misspelt) wording is matched by the component test, keep it as is
+        raise ValueError(f"Invaldid value {value} for --max_features: must either be an integer or one of \'sqrt\', \'log2\' or \'all\'") from err
+
+
+def main():
+    """Predict cell types for the query modality with a random forest classifier.
+
+    Either loads a pickled model (`--model`) or trains a new
+    RandomForestClassifier on the reference modality (`--reference`), then
+    stores the predicted label and the highest class probability per cell in
+    the `.obs` columns named by `--output_obs_predictions` and
+    `--output_obs_probability` and writes the MuData to `--output`.
+
+    Raises:
+        ValueError: for an invalid `--max_features`, or when not exactly one
+            of `--model` / `--reference` is provided.
+    """
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_modality = input_mudata.mod[par["modality"]].copy()
+
+    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X
+
+    # Handle max_features parameter
+    max_features = _parse_max_features(par["max_features"])
+
+    if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]):
+        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")
+
+    if par["model"]:
+        logger.info("Loading a pre-trained model")
+        # NOTE: unpickling executes code stored in the file; only load models
+        # that come from a trusted source.
+        with open(par["model"], "rb") as model_file:
+            model = pickle.load(model_file)
+
+    elif par["reference"]:
+        logger.info("Reading reference data")
+
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+
+        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X
+
+        logger.info("Training a model...")
+        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
+        model = RandomForestClassifier(
+            n_estimators=par["n_estimators"],
+            criterion=par["criterion"],
+            max_depth=par["max_depth"],
+            # "uniform" is this component's spelling of scikit-learn's None
+            class_weight=None if par["class_weight"] == "uniform" else par["class_weight"],
+            max_features=max_features
+        )
+        model.fit(reference_matrix, labels)
+
+    logger.info("Running predictions...")
+    predictions = model.predict(input_matrix)
+    # keep only the probability of the most likely class for every cell
+    probabilities = np.max(model.predict_proba(input_matrix), axis=1)
+
+    input_modality.obs[par["output_obs_predictions"]] = predictions
+    input_modality.obs[par["output_obs_probability"]] = probabilities
+
+    logger.info("Writing output data")
+    # the modality was copied above, so put the annotated copy back
+    input_mudata.mod[par["modality"]] = input_modality
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/random_forest_annotation/test.py
+++ b/src/annotate/random_forest_annotation/test.py
@@ -0,0 +1,185 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+from openpipelinetestutils.asserters import assert_annotation_objects_equal
+import os
+from sklearn.ensemble import RandomForestClassifier
+import pickle
+
+## VIASH START
+# Debugging default; viash replaces this block when the test is run through viash.
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+# Query and reference files shared by all tests in this module.
+# NOTE(review): both files are expected directly under resources_dir, without
+# the sub-folders they have in the component config's test_resources paths.
+# Presumably the debug default above needs adjusting for local runs; confirm.
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
+
+@pytest.fixture
+def subset_genes(random_h5mu_path):
+    """Fixture returning a function that restricts query and reference to shared genes.
+
+    The returned `wrapper(input_mudata_file, reference_mudata_file, modality)`
+    reads both h5mu files, re-indexes the reference by Ensembl id, subsets the
+    given modality of both objects to the ids they have in common, writes the
+    results to fresh paths from `random_h5mu_path` and returns
+    `(subset_input_path, subset_reference_path)`.
+    """
+    def wrapper(input_mudata_file, reference_mudata_file, modality):
+        input_mudata = mu.read_h5mu(input_mudata_file)
+        input_adata = input_mudata.mod[modality]
+        reference_mudata = mu.read_h5mu(reference_mudata_file)
+        reference_adata = reference_mudata.mod[modality]
+
+        # Keep the current index as "gene_symbol" and re-index by the
+        # "ensemblid" column with a trailing ".<digits>" suffix removed.
+        # This is done on the MuData-level .var as well as on the modality.
+        reference_mudata.var["gene_symbol"] = list(reference_mudata.var.index)
+        reference_mudata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_mudata.var["ensemblid"]]
+        reference_adata.var["gene_symbol"] = list(reference_adata.var.index)
+        reference_adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_adata.var["ensemblid"]]
+        # ids present in both the reference and the query modality
+        common_ens_ids = list(set(reference_adata.var.index).intersection(set(input_adata.var.index)))
+
+        reference = reference_adata[:, common_ens_ids].copy()
+        query = input_adata[:, common_ens_ids].copy()
+
+        # swap the subsetted modalities back into their containers
+        input_mudata.mod[modality] = query
+        reference_mudata.mod[modality] = reference
+
+        subset_input_mudata_file = random_h5mu_path()
+        subset_reference_mudata_file = random_h5mu_path()
+
+        input_mudata.write_h5mu(subset_input_mudata_file)
+        reference_mudata.write_h5mu(subset_reference_mudata_file)
+        return subset_input_mudata_file, subset_reference_mudata_file
+    return wrapper
+
+@pytest.fixture
+def dummy_model(tmp_path, subset_genes):
+    """Fixture: train a default random forest on the subsetted reference and pickle it.
+
+    Returns the path of the pickle file inside `tmp_path`.
+    """
+    _, reference_path = subset_genes(input_file, reference_file, "rna")
+    rna = mu.read_h5mu(reference_path).mod["rna"]
+
+    classifier = RandomForestClassifier()
+    classifier.fit(rna.X, rna.obs["cell_ontology_class"].to_numpy())
+
+    pickled_model = tmp_path / "model.pkl"
+    pickled_model.write_bytes(pickle.dumps(classifier))
+
+    return pickled_model
+
+def test_simple_execution(run_component, random_h5mu_path, subset_genes):
+    """Train on the reference and check the default output columns and value range."""
+    query_path, reference_path = subset_genes(input_file, reference_file, "rna")
+    result_path = random_h5mu_path()
+
+    cli_args = [
+        "--input", query_path,
+        "--reference", reference_path,
+        "--reference_obs_target", "cell_ontology_class",
+        "--output", result_path,
+    ]
+    run_component(cli_args)
+
+    assert os.path.exists(result_path), "Output file does not exist"
+
+    original = mu.read_h5mu(input_file)
+    result = mu.read_h5mu(result_path)
+
+    # the modality that is not processed must come back unchanged
+    assert_annotation_objects_equal(original.mod["prot"], result.mod["prot"])
+
+    rna_obs = result.mod["rna"].obs
+    assert list(rna_obs.keys()) == ['random_forest_pred', 'random_forest_probability']
+
+    probabilities = rna_obs["random_forest_probability"]
+    assert all(0 <= value <= 1 for value in probabilities), "probabilities outside the range [0, 1]"
+
+def test_custom_out_obs_model_params(run_component, random_h5mu_path, subset_genes):
+    """Run with custom output column names and non-default model parameters."""
+    query_path, reference_path = subset_genes(input_file, reference_file, "rna")
+    result_path = random_h5mu_path()
+
+    cli_args = [
+        "--input", query_path,
+        "--reference", reference_path,
+        "--reference_obs_target", "cell_ontology_class",
+        "--output_obs_predictions", "dummy_pred",
+        "--output_obs_probability", "dummy_probability",
+        "--n_estimators", "10",
+        "--criterion", "entropy",
+        "--max_depth", "5",
+        "--class_weight", "balanced",
+        "--output", result_path,
+    ]
+    run_component(cli_args)
+
+    assert os.path.exists(result_path), "Output file does not exist"
+
+    original = mu.read_h5mu(input_file)
+    result = mu.read_h5mu(result_path)
+
+    # the modality that is not processed must come back unchanged
+    assert_annotation_objects_equal(original.mod["prot"], result.mod["prot"])
+
+    rna_obs = result.mod["rna"].obs
+    assert list(rna_obs.keys()) == ['dummy_pred', 'dummy_probability']
+
+    probabilities = rna_obs["dummy_probability"]
+    assert all(0 <= value <= 1 for value in probabilities), "probabilities outside the range [0, 1]"
+
+def test_with_model(run_component, random_h5mu_path, dummy_model, subset_genes):
+    """Predict with a pickled pre-trained model instead of a reference dataset."""
+    query_path, _ = subset_genes(input_file, reference_file, "rna")
+    result_path = random_h5mu_path()
+
+    cli_args = [
+        "--input", query_path,
+        "--model", dummy_model,
+        "--output", result_path,
+        "--reference_obs_target", "cell_ontology_class",
+    ]
+    run_component(cli_args)
+
+    assert os.path.exists(result_path), "Output file does not exist"
+
+    original = mu.read_h5mu(input_file)
+    result = mu.read_h5mu(result_path)
+
+    # the modality that is not processed must come back unchanged
+    assert_annotation_objects_equal(original.mod["prot"], result.mod["prot"])
+
+    rna_obs = result.mod["rna"].obs
+    assert list(rna_obs.keys()) == ['random_forest_pred', 'random_forest_probability']
+
+    probabilities = rna_obs["random_forest_probability"]
+    assert all(0 <= value <= 1 for value in probabilities), "probabilities outside the range [0, 1]"
+
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    """The component must fail when neither --model nor --reference is given."""
+    result_path = random_h5mu_path()
+    cli_args = [
+        "--input", input_file,
+        "--output", result_path,
+        "--reference_obs_target", "cell_ontology_class",
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(cli_args)
+
+    component_stdout = err.value.stdout.decode('utf-8')
+    assert re.search(r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+            component_stdout)
+
+def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model, subset_genes):
+    """The component must fail when both --model and --reference are given."""
+    result_path = random_h5mu_path()
+    query_path, reference_path = subset_genes(input_file, reference_file, "rna")
+    cli_args = [
+        "--input", query_path,
+        "--output", result_path,
+        "--reference", reference_path,
+        "--reference_obs_target", "cell_ontology_class",
+        "--model", dummy_model,
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(cli_args)
+
+    component_stdout = err.value.stdout.decode('utf-8')
+    assert re.search(r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+            component_stdout)
+
+def test_invalid_max_features(run_component, random_h5mu_path):
+    """An unknown --max_features value must make the component fail with a clear message."""
+    result_path = random_h5mu_path()
+    cli_args = [
+        "--input", input_file,
+        "--output", result_path,
+        "--reference_obs_target", "cell_ontology_class",
+        "--max_features", "invalid_value",
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(cli_args)
+
+    component_stdout = err.value.stdout.decode('utf-8')
+    assert re.search(r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'",
+            component_stdout)
+
+# Run this module's tests with pytest when executed directly and use the
+# pytest result as the process exit code.
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/scanvi/config.vsh.yaml
+++ b/src/annotate/scanvi/config.vsh.yaml
@@ -0,0 +1,218 @@
+name: scanvi
+namespace: annotate
+description: Semi-supervised model for single-cell transcriptomics data. A scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    description: Arguments related to the input (aka query) dataset.
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: Input h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: Reference h5mu file.
+        direction: input
+        required: true
+        example: reference.h5mu
+      - name: "--scvi_reference_model"
+        type: file
+        description: "Pretrained scvi reference model"
+        example: scvi_model.pt
+        direction: input
+        required: true
+      - name: "--reference_obs_label"
+        type: string
+        description: Key in obs field of reference AnnData with cell-type information.
+        example: "cell_ontology_class"
+        required: true
+
+  - name: SCANVI reference model training arguments
+    description: Arguments related to the reference SCANVI model.
+    arguments:
+    - name: "--reference_train_size"
+      type: double
+      description: Size of training set.
+      required: false
+      default: 0.9
+      min: 0.0
+      max: 1.0
+    - name: "--reference_max_epochs"
+      type: integer
+      description: Maximum number of epochs.
+      required: false
+      default: 400
+    - name: "--reference_learning_rate"
+      type: double
+      description: Learning rate.
+      required: false
+      default: 1e-3
+    - name: "--reference_reduce_lr_on_plateau"
+      type: boolean
+      description: Reduce learning rate on plateau.
+      required: false
+      default: true
+    - name: "--reference_lr_patience"
+      type: integer
+      description: Patience for learning rate reduction.
+      required: false
+      default: 25
+    - name: "--reference_lr_factor"
+      type: double
+      description: Factor by which to reduce learning rate.
+      required: false
+      default: 0.5
+      min: 0.0
+      max: 1.0
+    - name: "--reference_early_stopping"
+      type: boolean
+      description: Early stopping.
+      required: false
+      default: true
+    - name: "--reference_early_stopping_patience"
+      type: integer
+      description: Patience for early stopping.
+      required: false
+      default: 50
+
+  - name: SCANVI query model training arguments
+    description: Arguments related to the query SCANVI model.
+    arguments:
+    - name: "--query_train_size"
+      type: double
+      description: Size of training set.
+      required: false
+      default: 0.9
+      min: 0.0
+      max: 1.0
+    - name: "--query_max_epochs"
+      type: integer
+      description: Maximum number of epochs.
+      required: false
+      default: 400
+    - name: "--query_learning_rate"
+      type: double
+      description: Learning rate.
+      required: false
+      default: 1e-3
+    - name: "--query_reduce_lr_on_plateau"
+      type: boolean
+      description: Reduce learning rate on plateau.
+      required: false
+      default: true
+    - name: "--query_lr_patience"
+      type: integer
+      description: Patience for learning rate reduction.
+      required: false
+      default: 25
+    - name: "--query_lr_factor"
+      type: double
+      description: Factor by which to reduce learning rate.
+      required: false
+      default: 0.5
+      min: 0.0
+      max: 1.0
+    - name: "--query_early_stopping"
+      type: boolean
+      description: Early stopping.
+      required: false
+      default: true
+    - name: "--query_early_stopping_patience"
+      type: integer
+      description: Patience for early stopping.
+      required: false
+      default: 50
+
+  - name: Outputs
+    description: Arguments related to the output.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        required: true
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_model"
+        type: file
+        description: Folder where the state of the trained model will be saved to.
+        direction: output
+        required: false
+        example: model_dir
+      - name: "--output_obs_predictions"
+        type: string
+        description: |
+          In which `.obs` slots to store the predicted information.
+        default: "scanvi_pred"
+        required: false
+      - name: "--output_obs_probability"
+        type: string
+        default: "scanvi_probability"
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+      - name: "--output_obsm_scanvi_embedding"
+        type: string
+        default: "scanvi_embedding"
+        required: false
+        description: |
+          In which `.obsm` slots to store the scANVI embedding.
+      - name: "--unknown_celltype"
+        type: string
+        default: "Unknown"
+        required: false
+        description: |
+          Label for unknown cell types.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+- type: docker
+  image: python:3.12-slim
+  setup:
+    - type: apt
+      packages:
+        - procps
+    - type: python
+      __merge__: [ /src/base/requirements/scanpy.yaml, .]
+    - type: python
+      packages:
+        - scvi-tools==1.1.5
+    - type: python
+      __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+  __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+  
--- a/src/annotate/scanvi/script.py
+++ b/src/annotate/scanvi/script.py
@@ -0,0 +1,104 @@
+import sys
+import mudata as mu
+import scvi
+import numpy as np
+
+## VIASH START
+par = {  # placeholder values for running this script directly, outside of a viash build
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",  # read with mu.read_h5mu below
+    "scvi_reference_model": "resources_test/annotation_test_data/scvi_model.pt",
+    "reference_obs_label": "cell_ontology_class",
+}
+meta = {"resources_dir": "src/annotate/scanvi"}  # key is read right after this block
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+logger.info("Reading the input and reference data")
+
+input_data = mu.read_h5mu(par["input"])
+query = input_data.mod[par["modality"]]
+reference_data = mu.read_h5mu(par["reference"])
+reference = reference_data.mod[par["modality"]]
+
+logger.info(f"Loading the pretrained scVI model from {par['scvi_reference_model']}")
+scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"], reference)
+
+logger.info("Setting up scANVI model")
+
+scanvi_ref = scvi.model.SCANVI.from_scvi_model(
+    scvi_reference_model,
+    unlabeled_category=par["unknown_celltype"],
+    labels_key=par["reference_obs_label"],
+    )
+
+reference_plan_kwargs = {"lr": par["reference_learning_rate"],
+                         "reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
+                         "lr_patience": par['reference_lr_patience'],
+                         "lr_factor": par['reference_lr_factor']
+                        }
+
+logger.info("Training scANVI model on reference data with celltype labels")
+
+scanvi_ref.train(
+    train_size=par["reference_train_size"],
+    max_epochs=par['reference_max_epochs'],
+    early_stopping=par['reference_early_stopping'],
+    early_stopping_patience=par['reference_early_stopping_patience'],
+    plan_kwargs=reference_plan_kwargs,
+    check_val_every_n_epoch=1,
+    accelerator="auto",
+)
+
+logger.info("Updating and training scANVI model with query data")
+scvi.model.SCANVI.prepare_query_anndata(query, scanvi_ref, inplace=True)
+scanvi_query = scvi.model.SCANVI.load_query_data(query, scanvi_ref)
+
+query_plan_kwargs = {"lr": par["query_learning_rate"],
+                     "reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
+                     "lr_patience": par['query_lr_patience'],
+                     "lr_factor": par['query_lr_factor']
+                    }
+
+scanvi_query.train(
+    train_size=par["query_train_size"],
+    max_epochs=par['query_max_epochs'],
+    early_stopping=par['query_early_stopping'],
+    early_stopping_patience=par['query_early_stopping_patience'],
+    plan_kwargs=query_plan_kwargs,
+    check_val_every_n_epoch=1,
+    accelerator="auto",
+)
+
+logger.info("Adding latent representation to query data")
+query.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()
+
+logger.info("Running predictions on query data")
+query.obs[par["output_obs_predictions"]] = scanvi_query.predict(query)
+query.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(query, soft=True), axis=1)
+
+logger.info("Saving output and model")
+input_data.mod[par["modality"]] = query
+input_data.write_h5mu(par["output"], compression=par["output_compression"])
+
+if par["output_model"]:
+    scanvi_query.save(par["output_model"], overwrite=True)
--- a/src/annotate/scanvi/test.py
+++ b/src/annotate/scanvi/test.py
@@ -0,0 +1,142 @@
+import sys
+import os
+import pytest
+import re
+import mudata as mu
+import anndata as ad
+from openpipelinetestutils.asserters import assert_annotation_objects_equal
+import scvi
+import os
+## VIASH START
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+
+@pytest.fixture
+def create_scvi_model(random_path, tmp_path):
+    def wrapper(input_file, reference_file):
+        input_data = mu.read_h5mu(input_file)
+        input_modality = input_data.mod["rna"]
+        reference_data = mu.read_h5mu(reference_file)
+        reference_modality = reference_data.mod["rna"]
+        
+        reference_data.var["gene_symbol"] = list(reference_data.var.index)
+        reference_data.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_data.var["ensemblid"]]
+        reference_modality.var["gene_symbol"] = list(reference_modality.var.index)
+        reference_modality.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_modality.var["ensemblid"]]
+
+        common_ens_ids = list(set(reference_modality.var.index).intersection(set(input_modality.var.index)))
+
+        reference = reference_modality[:, common_ens_ids].copy()
+        query = input_modality[:, common_ens_ids].copy()
+
+        scvi.model.SCVI.setup_anndata(reference,
+                                    labels_key="cell_ontology_class"
+                                    )
+
+        scvi_model = scvi.model.SCVI(
+            reference,
+            use_layer_norm="both",
+            use_batch_norm="none",
+            encode_covariates=True,
+            dropout_rate=0.2,
+            n_layers=1,
+            )
+        scvi_model.train(max_epochs=10)
+        
+        input_data.mod["rna"] = query
+        reference_data.mod["rna"] = reference
+        
+        input_data_file = random_path(extension="h5mu")
+        reference_file = random_path(extension="h5mu")
+        scvi_model_file = tmp_path
+        
+        input_data.write_h5mu(input_data_file)
+        reference_data.write_h5mu(reference_file)
+        scvi_model.save(scvi_model_file, overwrite=True)
+                
+        return scvi_model_file, input_data_file, reference_file
+    return wrapper
+
+def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
+    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", input_file_scvi,
+        "--reference", reference_file_scvi,
+        "--scvi_reference_model", scvi_model_file,
+        "--reference_obs_label", "cell_ontology_class",
+        "--reference_max_epochs", "10",
+        "--query_max_epochs", "10",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file_scvi)
+    output_mudata = mu.read_h5mu(output_file)
+    
+    assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
+    assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
+    assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
+    assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
+    assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+    
+def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, tmp_path):
+    """Run the component with all tunable arguments set and check the output file and the saved model."""
+    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
+    output_file = random_h5mu_path()
+    # create_scvi_model saves its scVI model.pt into tmp_path, so the scANVI model goes to a separate folder
+    output_model = tmp_path / "scanvi_model"
+
+    run_component([
+        "--input", input_file_scvi,
+        "--reference", reference_file_scvi,
+        "--scvi_reference_model", scvi_model_file,
+        "--reference_obs_label", "cell_ontology_class",
+        "--output", output_file,
+        "--reference_max_epochs", "10",
+        "--reference_reduce_lr_on_plateau", "True",
+        "--reference_lr_patience", "5",
+        "--reference_lr_factor", "0.5",
+        "--reference_train_size", "0.8",
+        "--reference_early_stopping", "True",
+        "--reference_early_stopping_patience", "5",
+        "--reference_early_stopping_min_delta", "0.01",
+        "--query_max_epochs", "10",
+        "--query_reduce_lr_on_plateau", "True",
+        "--query_lr_patience", "5",
+        "--query_lr_factor", "0.5",
+        "--query_train_size", "0.8",
+        "--query_early_stopping", "True",
+        "--query_early_stopping_patience", "5",
+        "--query_early_stopping_min_delta", "0.01",
+        "--output_obs_predictions", "scanvi_pred",
+        "--output_obs_probability", "scanvi_probability",
+        "--output_compression", "gzip",
+        "--output_model", output_model
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+    assert os.path.exists(output_model / "model.pt"), "Model file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file_scvi)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, "Number of observations changed"
+    assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, "Number of variables changed"
+    assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
+    assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
+    assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/svm_annotation/config.vsh.yaml
+++ b/src/annotate/svm_annotation/config.vsh.yaml
@@ -0,0 +1,126 @@
+name: svm_annotation
+namespace: annotate
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the SVM classifier on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
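+        description: "Key in the `.obs` of the reference data containing the target labels used to train the classifier."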
+        required: true
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_obs_prediction"
+        type: string
+        default: svm_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: svm_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: true
+      - name: "--max_iter"
+        type: integer
+        description: "Maximum number of iterations for the SVM."
+        min: 1
+        default: 5000
+      - name: "--c_reg"
+        type: double
+        description: "Regularization parameter for the SVM."
+        min: 0.0
+        default: 1.0
+      - name: "--class_weight"
+        type: string
+        description: |
+          Class weights for the SVM. The `uniform` mode gives all classes a weight of one.
+          The `balanced` mode (default) uses the values of y to automatically adjust weights inversely
+          proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
+        choices: ["balanced", "uniform"]
+        default: "balanced"
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - libhdf5-dev
+          - procps
+      - type: python
+        packages:
+          - scikit-learn==1.5.2
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable 
+  - type: nextflow
--- a/src/annotate/svm_annotation/script.py
+++ b/src/annotate/svm_annotation/script.py
@@ -0,0 +1,94 @@
+import sys
+import logging
+import mudata as mu
+import numpy as np
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn import svm
+import pickle
+import re
+
+
+## VIASH START
+par = {  # placeholder values for running this script directly, outside of a viash build
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,  # main() requires exactly one of "model" / "reference" to be set
+    "reference_obs_target": "cell_ontology_class",
+    "input_layer": None,
+    "reference_layer": None,
+    "max_iter": 5000,
+    "c_reg": 1,
+    "class_weight": "balanced",
+    "output_compression": "gzip",
+    "var_query_gene_names": None,
+    "var_reference_gene_names": "ensemblid",
+    "feature_selection": True,  # matches the --feature_selection default in config.vsh.yaml
+    "output_obs_prediction": "svm_pred",
+    "output_obs_probability": "svm_probability",
+}
+meta = {"resources_dir": "src/annotate/svm_annotation"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+def setup_logger():
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(sys.stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+
+logger = setup_logger()
+
+def main():
+    """Label the query modality with a calibrated linear SVM (loaded or trained) and write the MuData to par["output"]."""
+    if bool(par["model"]) == bool(par["reference"]):  # both missing or both given
+        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")
+
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_modality = input_mudata.mod[par["modality"]].copy()
+
+    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X
+
+    if par["model"]:
+        logger.info("Loading a pre-trained model")
+        # NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
+        with open(par["model"], "rb") as model_file:
+            model = pickle.load(model_file)
+    elif par["reference"]:
+        logger.info("Reading reference data")
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+
+        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X
+
+        logger.info("Training a model...")
+        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
+        model = CalibratedClassifierCV(svm.LinearSVC(
+            C=par["c_reg"],
+            max_iter=par["max_iter"],
+            class_weight=None if par["class_weight"] == "uniform" else par["class_weight"],  # "uniform" is passed on as None
+            dual="auto",
+        ))
+        model.fit(reference_matrix, labels)
+
+    logger.info("Running predictions...")
+    predictions = model.predict(input_matrix)
+    probabilities = np.max(model.predict_proba(input_matrix), axis=1)  # keep only the highest class probability per row
+
+    input_modality.obs[par["output_obs_prediction"]] = predictions
+    input_modality.obs[par["output_obs_probability"]] = probabilities
+
+    logger.info("Writing output data")
+    input_mudata.mod[par["modality"]] = input_modality
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/svm_annotation/test.py
+++ b/src/annotate/svm_annotation/test.py
@@ -0,0 +1,164 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+from openpipelinetestutils.asserters import assert_annotation_objects_equal
+import os
+from sklearn import svm
+from sklearn.calibration import CalibratedClassifierCV
+import pickle
+
+## VIASH START
+meta = {
+    "resources_dir": "resources_test"
+}
+## VIASH END
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+# model_file = f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
+
+@pytest.fixture
+def subset_genes(random_h5mu_path):
+    def wrapper(input_mudata_file, reference_mudata_file, modality):
+        input_mudata = mu.read_h5mu(input_mudata_file)
+        input_adata = input_mudata.mod[modality]
+        reference_mudata = mu.read_h5mu(reference_mudata_file)
+        reference_adata = reference_mudata.mod[modality]
+        
+        reference_mudata.var["gene_symbol"] = list(reference_mudata.var.index)
+        reference_mudata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_mudata.var["ensemblid"]]
+        reference_adata.var["gene_symbol"] = list(reference_adata.var.index)
+        reference_adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_adata.var["ensemblid"]]
+        common_ens_ids = list(set(reference_adata.var.index).intersection(set(input_adata.var.index)))
+        
+        reference = reference_adata[:, common_ens_ids].copy()
+        query = input_adata[:, common_ens_ids].copy()
+        
+        input_mudata.mod[modality] = query
+        reference_mudata.mod[modality] = reference
+        
+        subset_input_mudata_file = random_h5mu_path()
+        subset_reference_mudata_file = random_h5mu_path()
+        
+        input_mudata.write_h5mu(subset_input_mudata_file)
+        reference_mudata.write_h5mu(subset_reference_mudata_file)
+        return subset_input_mudata_file, subset_reference_mudata_file
+    return wrapper
+
+@pytest.fixture
+def dummy_model(tmp_path, subset_genes):
+    _, subset_reference_file = subset_genes(input_file, reference_file, "rna")
+    reference_modality = mu.read_h5mu(subset_reference_file).mod["rna"]
+    
+    labels = reference_modality.obs["cell_ontology_class"].to_numpy()
+    model = CalibratedClassifierCV(svm.LinearSVC(
+        max_iter=10,
+        dual="auto",
+    ))
+    model.fit(reference_modality.X, labels)
+    
+    model_path = tmp_path / "model.pkl"
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+        
+    return model_path
+
+def test_simple_execution(run_component, random_h5mu_path, subset_genes):
+    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", subset_input_file,
+        "--reference", subset_reference_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ['svm_pred',
+                                                         'svm_probability']
+
+    obs_values = output_mudata.mod["rna"].obs["svm_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]"
+
+
+def test_custom_out_obs_model_params(run_component, random_h5mu_path, subset_genes):
+    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", subset_input_file,
+        "--reference", subset_reference_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--output_obs_prediction", "dummy_pred",
+        "--output_obs_probability", "dummy_probability",
+        "--max_iter", "1000",
+        "--c_reg", "0.1",
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ['dummy_pred',
+                                                         'dummy_probability']
+
+    obs_values = output_mudata.mod["rna"].obs["dummy_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]"
+
+
+def test_with_model(run_component, random_h5mu_path, dummy_model, subset_genes):
+    subset_input_file, _ = subset_genes(input_file, reference_file, "rna")
+    output_file = random_h5mu_path()
+
+    run_component([
+        "--input", subset_input_file,
+        "--reference_obs_target", "cell_ontology_class",
+        "--model", dummy_model,
+        "--output", output_file
+    ])
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"],
+                                    output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ['svm_pred',
+                                                         'svm_probability']
+
+    obs_values = output_mudata.mod["rna"].obs["svm_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]"
+
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component([
+            "--input", input_file,
+            "--reference_obs_target", "cell_ontology_class",
+            "--output", output_file,
+        ])
+    assert re.search(r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+            err.value.stdout.decode('utf-8'))
+
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/authors/angela_pisco.yaml
+++ b/src/authors/angela_pisco.yaml
@@ -0,0 +1,14 @@
+name: Angela Oliveira Pisco
+info:
+  role: Contributor
+  links:
+    github: aopisco
+    orcid: "0000-0003-0142-2355"
+    linkedin: aopisco
+  organizations:
+    - name: Insitro
+      href: https://insitro.com
+      role: Director of Computational Biology
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/dorien_roosen.yaml
+++ b/src/authors/dorien_roosen.yaml
@@ -0,0 +1,11 @@
+name: Dorien Roosen
+info:
+  role: Core Team Member
+  links:
+    email: dorien@data-intuitive.com
+    github: dorien-er
+    linkedin: dorien-roosen
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
--- a/src/authors/dries_de_maeyer.yaml
+++ b/src/authors/dries_de_maeyer.yaml
@@ -0,0 +1,11 @@
+name: Dries De Maeyer
+info:
+  role: Core Team Member
+  links:
+    email: ddemaeyer@gmail.com
+    github: ddemaeyer
+    linkedin: dries-de-maeyer-b46a814
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Principal Scientist
--- a/src/authors/dries_schaumont.yaml
+++ b/src/authors/dries_schaumont.yaml
@@ -0,0 +1,12 @@
+name: Dries Schaumont
+info:
+  role: Core Team Member
+  links:
+    email: dries@data-intuitive.com
+    github: DriesSchaumont
+    orcid: "0000-0002-4389-0440"
+    linkedin: dries-schaumont
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
--- a/src/authors/elizabeth_mlynarski.yaml
+++ b/src/authors/elizabeth_mlynarski.yaml
@@ -0,0 +1,6 @@
+name: Elizabeth Mlynarski
+info:
+  role: Contributor
+  organizations:
+    - name: Janssen R&D US
+      role: Principal Scientist Computational Genomics
--- a/src/authors/isabelle_bergiers.yaml
+++ b/src/authors/isabelle_bergiers.yaml
@@ -0,0 +1,10 @@
+name: Isabelle Bergiers 
+info:
+  role: Contributor
+  links:
+    github: Isabelle-b
+    orcid: 0000-0001-9622-7960
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Scientist OMICS Technology
--- a/src/authors/jakub_majercik.yaml
+++ b/src/authors/jakub_majercik.yaml
@@ -0,0 +1,11 @@
+name: Jakub Majercik
+info:
+  role: Contributor
+  links:
+    email: jakub@data-intuitive.com
+    github: jakubmajercik
+    linkedin: jakubmajercik
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Bioinformatics Engineer
--- a/src/authors/kai_waldrant.yaml
+++ b/src/authors/kai_waldrant.yaml
@@ -0,0 +1,15 @@
+name: Kai Waldrant
+info:
+  role: Contributor
+  links:
+    email: kai@data-intuitive.com
+    github: KaiWaldrant
+    orcid: "0009-0003-8555-1361"
+    linkedin: kaiwaldrant
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Bioinformatician
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Contributor
--- a/src/authors/malte_luecken.yaml
+++ b/src/authors/malte_luecken.yaml
@@ -0,0 +1,16 @@
+name: Malte D. Luecken
+info:
+  role: Core Team Member
+  links:
+    email: malte.luecken@helmholtz-muenchen.de
+    github: LuckyMD
+    orcid: "0000-0001-7464-7921"
+    linkedin: malte-l%C3%BCcken-b8b21049
+    twitter: MDLuecken
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Group Leader
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/marijke_van_moerbeke.yaml
+++ b/src/authors/marijke_van_moerbeke.yaml
@@ -0,0 +1,11 @@
+name: Marijke Van Moerbeke
+info:
+  role: Contributor
+  links:
+    github: mvanmoerbeke
+    orcid: 0000-0002-3097-5621
+    linkedin: marijke-van-moerbeke-84303a34
+  organizations:
+    - name: OpenAnalytics
+      href: https://www.openanalytics.eu
+      role: Statistical Consultant
--- a/src/authors/matthias_beyens.yaml
+++ b/src/authors/matthias_beyens.yaml
@@ -0,0 +1,12 @@
+name: Matthias Beyens
+info:
+  role: Contributor
+  links:
+    github: MatthiasBeyens
+    orcid: "0000-0003-3304-0706"
+    email: matthias.beyens@gmail.com
+    linkedin: mbeyens
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Principal Scientist
--- a/src/authors/mauro_saporita.yaml
+++ b/src/authors/mauro_saporita.yaml
@@ -0,0 +1,11 @@
+name: Mauro Saporita
+info:
+  role: Contributor
+  links:
+    email: maurosaporita@gmail.com
+    github: mauro-saporita
+    linkedin: mauro-saporita-930b06a5
+  organizations:
+    - name: Ardigen
+      href: https://ardigen.com
+      role: Lead Nextflow Developer
--- a/src/authors/povilas_gibas.yaml
+++ b/src/authors/povilas_gibas.yaml
@@ -0,0 +1,11 @@
+name: Povilas Gibas
+info:
+  role: Contributor
+  links:
+    email: povilasgibas@gmail.com
+    github: PoGibas
+    linkedin: povilas-gibas
+  organizations:
+    - name: Ardigen
+      href: https://ardigen.com
+      role: Bioinformatician
--- a/src/authors/robrecht_cannoodt.yaml
+++ b/src/authors/robrecht_cannoodt.yaml
@@ -0,0 +1,15 @@
+name: Robrecht Cannoodt
+info:
+  role: Core Team Member
+  links:
+    email: robrecht@data-intuitive.com
+    github: rcannood
+    orcid: "0000-0003-3641-729X"
+    linkedin: robrechtcannoodt
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Science Engineer
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/samuel_d_souza.yaml
+++ b/src/authors/samuel_d_souza.yaml
@@ -0,0 +1,10 @@
+name: Samuel D'Souza
+info:
+  role: Contributor
+  links:
+    github: srdsam
+    linkedin: samuel-d-souza-887023150/
+  organizations:
+    - name: Chan Zuckerberg Biohub
+      href: https://www.czbiohub.org
+      role: Data Engineer
--- a/src/authors/sarah_ouologuem.yaml
+++ b/src/authors/sarah_ouologuem.yaml
@@ -0,0 +1,10 @@
+name: Sarah Ouologuem
+info:
+  role: Contributor
+  links:
+    github: SarahOuologuem
+    orcid: 0009-0005-3398-1700
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Student Assistant
--- a/src/authors/toni_verbeiren.yaml
+++ b/src/authors/toni_verbeiren.yaml
@@ -0,0 +1,10 @@
+name: Toni Verbeiren
+info:
+  role: Core Team Member
+  links:
+    github: tverbeiren
+    linkedin: verbeiren
+  organizations:
+  - name: Data Intuitive
+    href: https://www.data-intuitive.com
+    role: Data Scientist and CEO
--- a/src/authors/vladimir_shitov.yaml
+++ b/src/authors/vladimir_shitov.yaml
@@ -0,0 +1,12 @@
+name: Vladimir Shitov
+info:
+  role: Contributor
+  links:
+    email: vladimir.shitov@helmholtz-muenchen.de
+    github: vladimirshitov
+    orcid: "0000-0002-1960-8812"
+    linkedin: vladimir-shitov-9a659513b
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: PhD Candidate
--- a/src/authors/weiwei_schultz.yaml
+++ b/src/authors/weiwei_schultz.yaml
@@ -0,0 +1,6 @@
+name: Weiwei Schultz
+info:
+  role: Contributor
+  organizations:
+    - name: Janssen R&D US
+      role: Associate Director Data Sciences
--- a/src/authors/xichen_wu.yaml
+++ b/src/authors/xichen_wu.yaml
@@ -0,0 +1,11 @@
+name: Xichen Wu
+info:
+  role: Contributor
+  links:
+    github: wxicu
+    linkedin: xichen-wu
+    orcid: 0009-0008-2168-4508
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Student Assistant
--- a/src/base/openpipelinetestutils/.gitignore
+++ b/src/base/openpipelinetestutils/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+build
+eggs/
+*.egg
+*.egg-info/
--- a/src/base/openpipelinetestutils/init.py
+++ b/src/base/openpipelinetestutils/init.py
--- a/src/base/openpipelinetestutils/asserters.py
+++ b/src/base/openpipelinetestutils/asserters.py
@@ -0,0 +1,240 @@
+import mudata
+import anndata
+import pandas as pd
+import numpy as np
+from scipy.sparse import issparse, spmatrix
+from mudata import MuData
+from pathlib import Path
+from pandas.testing import assert_frame_equal
+from typing import Literal
+from .typing import AnnotationObjectOrPathLike
+from functools import singledispatch
+
+
+def _read_if_needed(anndata_mudata_path_or_obj):
+    if isinstance(anndata_mudata_path_or_obj, (str, Path)):
+        return mudata.read(str(anndata_mudata_path_or_obj)) # TODO: remove when mudata fixes PAth bug
+    if isinstance(anndata_mudata_path_or_obj, (mudata.MuData, anndata.AnnData)):
+        return anndata_mudata_path_or_obj.copy()
+    raise AssertionError("Expected 'Path', 'str' to MuData/AnnData "
+                         "file or MuData/AnnData object.")
+
+def _assert_same_annotation_object_class(left, right):
+    assert type(left) == type(right), (f"Two objects are not of the same class:"
+                                       f"\n[Left]:{type(left)}\n[right]:{type(right)}")
+    
+def _promote_dtypes(left, right):
+    # Create new DataFrames to avoid modifying the original ones
+    left_aligned = left.copy()
+    right_aligned = right.copy()
+    
+    for column in left.columns:
+        l_dtype = left[column].dtype
+        r_dtype = right[column].dtype
+        
+        if l_dtype == r_dtype:
+            # No need to modify dtypes that are already the same
+            continue
+        if not all(map(pd.api.types.is_any_real_numeric_dtype, (r_dtype, l_dtype))):
+            # Do not try casting without dtypes that do not represent real numbers
+            continue
+        is_extension = pd.api.types.is_extension_array_dtype(l_dtype)
+        if is_extension and not pd.api.types.is_extension_array_dtype(r_dtype):
+            continue
+        numpy_dtype_l = l_dtype.type if is_extension else l_dtype
+        numpy_dtype_r = r_dtype.type if is_extension else r_dtype 
+        # At this point we should have only integer or float dtypes 
+        common_dtype = np.promote_types(numpy_dtype_l, numpy_dtype_r)
+        if is_extension:
+            left_aligned[column] = pd.array(left[column], dtype=common_dtype)
+            right_aligned[column] = pd.array(right[column], dtype=common_dtype)
+        else:
+            left_aligned[column] = left[column].astype(common_dtype)
+            right_aligned[column] = right[column].astype(common_dtype)
+    
+    return left_aligned, right_aligned
+
+
+def assert_mudata_modality_keys_equal(left, right):
+    left_keys = set(left.mod.keys())
+    right_keys = set(right.mod.keys())
+    if left_keys!= right_keys:
+        raise AssertionError("MuData modalities differ:"
+                             f"\n[left]:{left_keys}\n[right]:{right_keys}")
+
+def assert_shape_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike):
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    if left.shape != right.shape:
+        raise AssertionError(f"{type(left).__name__} shapes differ:"
+                             f"\n[left]:{left.shape}\n[right]:{right.shape}")
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            assert_shape_equal(modality, right[mod_name])
+ 
+
+def assert_obs_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, 
+                           *args, **kwargs):
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    pd.testing.assert_index_equal(left.obs_names, right.obs_names, *args, **kwargs)
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            assert_obs_names_equal(modality, right[mod_name])
+
+
+def assert_var_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, 
+                           *args, **kwargs):
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    pd.testing.assert_index_equal(left.var_names, right.var_names, *args, **kwargs)
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            assert_var_names_equal(modality, right[mod_name])
+
+
+def _assert_frame_equal(left, right, sort=False, promote_precicion=False, *args, **kwargs):
+    if sort:
+        left, right = left.sort_index(inplace=False), right.sort_index(inplace=False)
+        left, right = left.sort_index(axis=1, inplace=False), right.sort_index(axis=1, inplace=False)
+        
+    if promote_precicion:
+        left, right = _promote_dtypes(left, right)
+        assert_frame_equal(left, right, check_exact=False, atol=1e-3, *args, **kwargs)
+    else:
+        assert_frame_equal(left, right, *args, **kwargs)
+
+def assert_annotation_frame_equal(annotation_attr: Literal["obs", "var"], 
+                                   left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, 
+                                   sort=False,
+                                   promote_precicion=False,
+                                   *args, **kwargs):
+    if not annotation_attr in ("obs", "var"):
+        raise ValueError("annotation_attr should be 'obs', or 'var'")
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    left_frame, right_frame = getattr(left, annotation_attr), getattr(right, annotation_attr)
+    _assert_frame_equal(left_frame, right_frame, sort=sort, promote_precicion=promote_precicion, *args, **kwargs)
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            assert_annotation_frame_equal(annotation_attr, modality, 
+                                           right[mod_name], sort=sort, promote_precicion=promote_precicion, *args, **kwargs)
+
+def _assert_layer_equal(left, right):
+    if issparse(left):
+        if not issparse(right):
+           raise AssertionError("Layers differ:\n[left]: sparse\n[right]: not sparse")
+        if left.getformat() != right.getformat():
+            raise AssertionError("Layers format differ:"
+                                 f"\n[left]:{left.getformat()}\n[right]: {right.getformat()}")
+        assert np.all(left.indices == right.indices), "Layers differ: indices are not the same"
+        assert np.all(left.indptr == right.indptr), "Layers differ: index pointers are not the same"
+        np.testing.assert_allclose(left.data, right.data, rtol=1e-5,
+                                  err_msg="Layers data differs.", equal_nan=True)
+    else:
+        if issparse(right):
+            raise AssertionError("Layers differ:\n[left]: not sparse\n[right]: sparse")
+        np.testing.assert_allclose(left, right, 
+                                   rtol=1e-5,
+                                   err_msg="Layers data differs.",
+                                   equal_nan=True)
+        
+
+def assert_layers_equal(left: AnnotationObjectOrPathLike,
+                        right: AnnotationObjectOrPathLike):
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    if left.raw is not None:
+        try:
+            _assert_layer_equal(left.raw, right.raw)
+        except AssertionError as e:
+            e.add_note(".raw is different")
+            raise
+    else:
+        if right.raw:
+            raise AssertionError("Layer .raw differs: "
+                                 f"\n[left]:{left.raw}\n[right]:{right}")
+    if left.X is not None:
+        try:
+            _assert_layer_equal(left.X, right.X)
+        except AssertionError as e:
+            e.add_note("X is different.")
+            raise
+    if left.layers:
+        assert right.layers and (left.layers.keys() == right.layers.keys()), \
+        "Avaiable layers differ:" \
+        f"\n[left]:{left.layers}\n[right]{right.layers}"
+        for layer_name, layer in left.layers.items():
+            try:
+                _assert_layer_equal(layer, right.layers[layer_name])
+            except AssertionError as e:
+                e.add_note(f"Layer {layer_name} is different")
+                raise
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            assert_layers_equal(modality, right[mod_name])
+
+
+
+def assert_multidimensional_annotation_equal(annotation_attr: Literal["obsm", "varm"],
+                                             left, right, sort=False):
+    if not annotation_attr in ("obsm", "varm"):
+        raise ValueError("annotation_attr should be 'obsm', or 'varm'")
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+
+    @singledispatch
+    def _assert_multidimensional_value_equal(left, right, **kwargs):
+        raise NotImplementedError("Unregistered type found while asserting")
+    
+    @_assert_multidimensional_value_equal.register
+    def _(left: pd.DataFrame, right, **kwargs):
+        _assert_frame_equal(left, right, **kwargs)
+   
+    @_assert_multidimensional_value_equal.register(np.ndarray)
+    @_assert_multidimensional_value_equal.register(spmatrix)
+    def _(left, right, **kwargs):
+        # Cannot sort sparse and dense matrices so ignore sort param
+        _assert_layer_equal(left, right)
+
+    left_dict, right_dict = getattr(left, annotation_attr), getattr(right, annotation_attr)
+    left_keys, right_keys = left_dict.keys(), right_dict.keys()
+    assert left_keys == right_keys, f"Keys of {annotation_attr} differ:\n[left]:{left_keys}\n[right]:{right_keys}"
+    for left_key, left_value in left_dict.items():
+        try:
+            _assert_multidimensional_value_equal(left_value, right_dict[left_key], sort=sort)
+        except AssertionError as e:
+            e.add_note(f"Failing key: {left_key}")
+            raise
+    if isinstance(left, MuData):
+        assert_mudata_modality_keys_equal(left, right)
+        for mod_name, modality in left.mod.items(): 
+            try:
+                assert_multidimensional_annotation_equal(annotation_attr ,modality, right[mod_name], sort=sort)
+            except AssertionError as e:
+                e.add_note(f"Failing modality: {mod_name}")
+                raise
+
+def assert_annotation_objects_equal(left: AnnotationObjectOrPathLike,
+                                    right: AnnotationObjectOrPathLike,
+                                    check_data=True,
+                                    sort=True,
+                                    promote_precision=False):
+    left, right = _read_if_needed(left), _read_if_needed(right)
+    _assert_same_annotation_object_class(left, right)
+    assert_shape_equal(left, right)
+    assert_annotation_frame_equal("obs", left, right, sort=sort, promote_precicion=promote_precision)
+    assert_annotation_frame_equal("var", left, right, sort=sort, promote_precicion=promote_precision)
+    for slot in ("varm", "obsm"):
+        try:
+            assert_multidimensional_annotation_equal(slot, left, right, sort=sort)
+        except AssertionError as e:
+            e.add_note(f"Failing multidimensional slot: {slot}")
+            raise
+    if check_data:
+        assert_layers_equal(left, right)
--- a/src/base/openpipelinetestutils/conftest.py
+++ b/src/base/openpipelinetestutils/conftest.py
@@ -0,0 +1,13 @@
+import importlib
+import pytest
+from pathlib import Path
+
+def pytest_collect_file(file_path: Path, parent):
+    if (file_path.name == ".viash_script.sh"):
+        # Allow file ending in .sh to be imported
+        importlib.machinery.SOURCE_SUFFIXES.append('.viash_script.sh')
+        return pytest.Module.from_parent(parent, path=file_path)
+
+
+def pytest_collection_finish(session):
+   importlib.machinery.SOURCE_SUFFIXES.remove('.viash_script.sh')
--- a/src/base/openpipelinetestutils/fixtures.py
+++ b/src/base/openpipelinetestutils/fixtures.py
@@ -0,0 +1,63 @@
+from uuid import uuid4
+import pytest
+import pandas as pd
+import anndata as ad
+import mudata as md
+
+@pytest.fixture
+def random_path(tmp_path):
+    def wrapper(extension=None):
+        extension = "" if not extension else f".{extension}"
+        return tmp_path / f"{uuid4()}{extension}"
+    return wrapper 
+
+@pytest.fixture
+def random_h5mu_path(random_path):
+    def wrapper():
+        return random_path(extension="h5mu")
+    return wrapper
+
+@pytest.fixture
+def write_mudata_to_file(random_h5mu_path):
+    def wrapper(mudata_obj):
+        output_path = random_h5mu_path()
+        mudata_obj.write(output_path)
+        return output_path
+    return wrapper
+
+@pytest.fixture
+def small_anndata_1():
+    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
+    obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"])
+    var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"])
+    ad1 = ad.AnnData(df, obs=obs, var=var)
+    return ad1
+
+@pytest.fixture
+def small_anndata_2():
+    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var4", "var5", "var6"])
+    obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"])
+    var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"])
+    ad2 = ad.AnnData(df, obs=obs2, var=var2)
+    return ad2
+
+@pytest.fixture
+def small_mudata(small_anndata_1, small_anndata_2):
+    return md.MuData({'mod1': small_anndata_1, 'mod2': small_anndata_2})
+
+@pytest.fixture
+def small_mudata_path(small_mudata, write_mudata_to_file):
+    return write_mudata_to_file(small_mudata)
+
+@pytest.fixture
+def split_small_mudata_path(small_mudata_mod1_path, small_mudata_mod2_path):
+    return small_mudata_mod1_path, small_mudata_mod2_path
+
+@pytest.fixture
+def small_mudata_mod1_path(small_mudata, write_mudata_to_file):
+    return write_mudata_to_file(md.MuData({'mod1': small_mudata.mod['mod1']}))
+
+@pytest.fixture
+def small_mudata_mod2_path(small_mudata, write_mudata_to_file):
+    return write_mudata_to_file(md.MuData({'mod2': small_mudata.mod['mod2']}))
+
--- a/src/base/openpipelinetestutils/pyproject.toml
+++ b/src/base/openpipelinetestutils/pyproject.toml
@@ -0,0 +1,5 @@
+[build-system]
+requires = [
+  "setuptools >= 40.9.0",
+]
+build-backend = "setuptools.build_meta"
--- a/src/base/openpipelinetestutils/setup.cfg
+++ b/src/base/openpipelinetestutils/setup.cfg
@@ -0,0 +1,22 @@
+[metadata]
+name = openpipelinetestutils
+author = Dries Schaumont
+author_email = dries@data-intuitive.com
+maintainer = Dries Schaumont
+maintainer_email = dries@data-intuitive.com
+description = Various test utilities for openpipeline.
+license = MIT
+
+[options]
+python_requires = >=3.8
+install_requires =
+    pytest >= 6.2
+    mudata~=0.2.3
+    pandas!=2.1.2
+    anndata~=0.9.1
+package_dir=
+    openpipelinetestutils = .
+
+[options.entry_points]
+pytest11 =
+    openpipelineutils = openpipelinetestutils.fixtures
--- a/src/base/openpipelinetestutils/typing.py
+++ b/src/base/openpipelinetestutils/typing.py
@@ -0,0 +1,7 @@
+from typing import Union
+from mudata import MuData
+from anndata import AnnData
+from pathlib import Path
+
+AnnotationObject = Union[MuData, AnnData]
+AnnotationObjectOrPathLike = Union[AnnotationObject, str, Path]
--- a/src/base/openpipelinetestutils/utils.py
+++ b/src/base/openpipelinetestutils/utils.py
@@ -0,0 +1,60 @@
+from .typing import AnnotationObject
+from typing import Union, Literal
+from functools import reduce
+from operator import attrgetter
+from anndata import AnnData
+from mudata import MuData
+from itertools import product
+
+
+def remove_annotation_column(annotation_object: AnnotationObject,
+                             column_names: list[str] | str,
+                             axis: Union[Literal["obs"], Literal["var"], 0, 1],
+                             modality_name: str | None = None):
+    """Drop columns from the ``obs`` or ``var`` frame of an annotation object.
+
+    Args:
+        annotation_object: AnnData or MuData object; its frame is replaced
+            through ``setattr`` and the same object is returned.
+        column_names: A single column name or a list of names to drop.
+        axis: ``"obs"``/``0`` selects ``.obs``, ``"var"``/``1`` selects ``.var``.
+        modality_name: For MuData only: restrict the removal to this
+            modality. When omitted, all modalities are processed.
+
+    Returns:
+        The ``annotation_object`` that was passed in.
+
+    Raises:
+        ValueError: When ``modality_name`` is given for an AnnData object, or
+            when a MuData object does not have ``.axis == 0``.
+    """
+    if isinstance(annotation_object, AnnData) and modality_name is not None:
+        raise ValueError("Cannot specify modality when object is of type AnnData.")
+    if isinstance(column_names, str):
+        column_names = [str(column_names)] # str to make a copy
+    # Normalise the accepted axis spellings to the attribute name.
+    axis_strings = {
+        "var": "var",
+        "obs": "obs",
+        0: "obs",
+        1: "var"
+    }
+    axis_string = axis_strings[axis]
+    axis_getter = attrgetter(axis_string)
+    axis_setter = lambda obj, value: setattr(obj, axis_string, value)
+    # Without a modality the given names are dropped from the object's own
+    # frame (this branch is also taken for AnnData objects).
+    if not modality_name:
+        axis_setter(annotation_object, axis_getter(annotation_object).drop(column_names,
+                                                                        axis="columns",
+                                                                        inplace=False))
+
+    def _get_columns_in_all_modalities(annotation_object, axis_string: str):
+        # Column names shared by the selected frame of every modality.
+        return reduce(
+            lambda a, b: a.intersection(b),
+            [getattr(annotation_object.mod[mod], axis_string).columns
+                for mod in annotation_object.mod],
+        ).to_list()
+
+    if isinstance(annotation_object, MuData):
+        if not annotation_object.axis == 0:
+                raise ValueError("This function was designed for mudata objects with .axis=0")
+        modality_names = [modality_name] if modality_name else list(annotation_object.mod.keys())
+        # Only computed for "var"; for "obs" no column is treated as global.
+        global_columns = _get_columns_in_all_modalities(annotation_object, axis_string) \
+                        if axis_string == "var" else []
+        # Names to drop from the top-level frame: "<modality>:<column>" for
+        # columns that are not global, the plain name for global columns.
+        extra_cols_to_remove = [f"{mod_name}:{column_name}" for mod_name, column_name
+                                in product(modality_names, column_names)
+                                if column_name not in global_columns]
+        extra_cols_to_remove += [column_name for column_name in column_names
+                                 if column_name in global_columns]
+        # NOTE(review): these names are only dropped from the top-level frame
+        # when a modality was specified; confirm this is intended.
+        if modality_name:
+            axis_setter(annotation_object, axis_getter(annotation_object).drop(extra_cols_to_remove,
+                                                                               axis="columns",
+                                                                               inplace=False))
+
+        # Remove the columns from each selected modality and store it back.
+        for mod_name in modality_names:
+            modality = annotation_object.mod[mod_name]
+            new_modality = remove_annotation_column(modality, column_names,
+                                                    axis=axis, modality_name=None)
+            annotation_object.mod[mod_name] = new_modality
+    return annotation_object
--- a/src/base/requirements/anndata.yaml
+++ b/src/base/requirements/anndata.yaml
@@ -0,0 +1,3 @@
+
+packages:
+  - anndata==0.10.8
--- a/src/base/requirements/anndata_mudata.yaml
+++ b/src/base/requirements/anndata_mudata.yaml
@@ -0,0 +1,5 @@
+__merge__: [/src/base/requirements/anndata.yaml, .]
+packages:
+  - mudata~=0.2.4
+  - pandas!=2.1.2
+  - numpy<2.0.0
--- a/src/base/requirements/python_test_setup.yaml
+++ b/src/base/requirements/python_test_setup.yaml
@@ -0,0 +1,7 @@
+test_setup:
+  - type: docker
+    copy: ["openpipelinetestutils /opt/openpipelinetestutils"]
+  - type: python
+    packages: /opt/openpipelinetestutils
+  - type: python
+    __merge__: /src/base/requirements/viashpy.yaml
--- a/src/base/requirements/scanpy.yaml
+++ b/src/base/requirements/scanpy.yaml
@@ -0,0 +1,3 @@
+
+packages:
+  - scanpy~=1.9.6
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`$ref: "defs_common.yaml#/definitions/Author"`
				`@@ -0,0 +1 @@`
				`$ref: "defs_viash.yaml#/definitions/PackageConfig"`