Build branch openpipeline/processing-workflows-quantile-filtering with version processing-workflows-quantile-filtering to openpipeline on branch processing-workflows-quantile-filtering (38f5f83d)

Build pipeline: openpipelines-bio.openpipeline.processing-workflows-quantijd49m Source commit: 38f5f83d11 Source message: fix tests
2026-06-09 18:32:44 +00:00
commit 5b8fa21d13
2535 changed files with 1322536 additions and 0 deletions
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,3 @@
+# Apply ruff as linter for python code
+9e87b864f699c371b444b592a19e610a3c9d3286
+ 
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+src/mapping/bd_rhapsody*/*.cwl linguist-generated
+src/query/cellxgene_census linguist-generated
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,41 @@
+# Jupyter notebooks
+.ipynb_checkpoints
+
+# pycache
+*__pycache__*
+.nfs*
+
+# R related
+.Rhistory
+*.Rproj
+.Rproj.user
+
+# Python virtual environments
+.venv
+
+# temporary files related
+temp
+
+# NextFlow
+work/
+.nextflow.log
+flowchart.*
+.nextflow*
+out/ 
+
+# Macos
+.DS_Store
+
+# viash related
+.viash_log*
+log.txt
+check_results/
+out/
+output*
+output_log/
+resources_test
+/viash_tools/
+
+# vscode
+.vscode/launch.json
+.vscode/settings.json
--- a/.lintr
+++ b/.lintr
@@ -0,0 +1,3 @@
+exclusions: list(
+    "README.qmd"
+    )
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,24 @@
+
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.14.0
+  hooks:
+    - id: ruff-check
+      args: [ --fix ]
+    - id: ruff-format
+- repo: local
+  hooks:
+  - id: run_styler
+    name: run_styler
+    language: r
+    description: style files with {styler}
+    entry: "Rscript -e 'styler::style_file(commandArgs(TRUE))'"
+    files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$'
+    additional_dependencies: 
+      - styler
+      - knitr
+- repo: https://github.com/lorenzwalthert/precommit
+  rev: v0.4.3.9015 
+  hooks:
+    - id: lintr
--- a/.pylintrc
+++ b/.pylintrc
@@ -0,0 +1,626 @@
+[MAIN]
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Load and enable all available extensions. Use --list-extensions to see a list
+# all available extensions.
+#enable-all-extensions=
+
+# In error mode, messages with a category besides ERROR or FATAL are
+# suppressed, and no reports are done by default. Error mode is compatible with
+# disabling specific errors.
+#errors-only=
+
+# Always return a 0 (non-error) status code, even if lint errors are found.
+# This is primarily useful in continuous integration scripts.
+#exit-zero=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-allow-list=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+extension-pkg-whitelist=
+
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+fail-on=
+
+# Specify a score threshold under which the program will exit with error.
+fail-under=10
+
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+#from-stdin=
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=CVS
+
+# Add files or directories matching the regular expressions patterns to the
+# ignore-list. The regex matches against paths and can be in Posix or Windows
+# format. Because '\' represents the directory delimiter on Windows systems, it
+# can't be used as an escape character.
+ignore-paths=
+
+# Files or directories matching the regular expression patterns are skipped.
+# The regex matches against base names, not paths. The default value ignores
+# Emacs file locks
+ignore-patterns=^\.#
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.10
+
+# Discover python modules and packages in the file system subtree.
+recursive=no
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# In verbose mode, extra non-checker-related info will be displayed.
+#verbose=
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+          bar,
+          baz,
+          toto,
+          tutu,
+          tata
+
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+bad-names-rgxs=
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+#class-attribute-rgx=
+
+# Naming style matching correct class constant names.
+class-const-naming-style=UPPER_CASE
+
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+#class-const-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style. If left empty, class names will be checked with the set naming style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,
+           j,
+           k,
+           ex,
+           Run,
+           _
+
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+good-names-rgxs=
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+#typevar-rgx=
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
+#variable-rgx=
+
+
+[CLASSES]
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp,
+                      __post_init__
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[DESIGN]
+
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+exclude-too-few-public-methods=
+
+# List of qualified class names to ignore when counting class parents (see
+# R0901)
+ignored-parents=
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=BaseException,
+                       Exception
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=
+
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+ext-import-graph=
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+import-graph=
+
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=HIGH,
+           CONTROL_FLOW,
+           INFERENCE,
+           INFERENCE_FAILURE,
+           UNDEFINED
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        line-too-long,
+        missing-module-docstring,
+        redefined-outer-name
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[METHOD_ARGS]
+
+# List of qualified names (i.e., library.method) which require a timeout
+# parameter e.g. 'requests.api.get,requests.api.post'
+timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+      XXX,
+      TODO
+
+# Regular expression of note tags to take in consideration.
+notes-rgx=
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+msg-template=
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+#output-format=
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[SIMILARITIES]
+
+# Comments are removed from the similarity computation
+ignore-comments=yes
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings=yes
+
+# Imports are removed from the similarity computation
+ignore-imports=yes
+
+# Signatures are removed from the similarity computation
+ignore-signatures=yes
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: en_AG (hunspell), en_AU
+# (hunspell), en_BS (hunspell), en_BW (hunspell), en_BZ (hunspell), en_CA
+# (hunspell), en_DK (hunspell), en_GB (hunspell), en_GH (hunspell), en_HK
+# (hunspell), en_IE (hunspell), en_IN (hunspell), en_JM (hunspell), en_MW
+# (hunspell), en_NA (hunspell), en_NG (hunspell), en_NZ (hunspell), en_PH
+# (hunspell), en_SG (hunspell), en_TT (hunspell), en_US (hunspell), en_ZA
+# (hunspell), en_ZM (hunspell), en_ZW (hunspell).
+spelling-dict=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of symbolic message names to ignore for Mixin members.
+ignored-checks-for-mixins=no-member,
+                          not-async-context-manager,
+                          not-context-manager,
+                          attribute-defined-outside-init
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+# Regex pattern to define which classes are considered mixins.
+mixin-class-rgx=.*[Mm]ixin
+
+# List of decorators that change the signature of a decorated function.
+signature-mutators=
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of names allowed to shadow builtins
+allowed-redefined-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,132 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+[INSERT CONTACT METHOD].
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 OpenPipelines
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,258 @@
+
+
+# OpenPipeline
+
+Extensible single cell analysis pipelines for reproducible and
+large-scale single cell processing using Viash and Nextflow.
+
+[![ViashHub](https://img.shields.io/badge/ViashHub-openpipeline-7a4baa.svg)](https://www.viash-hub.com/packages/openpipeline)
+[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2Fopenpipeline-blue.svg)](https://github.com/openpipelines-bio/openpipeline)
+[![GitHub
+License](https://img.shields.io/github/license/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/blob/main/LICENSE)
+[![GitHub
+Issues](https://img.shields.io/github/issues/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/issues)
+[![Viash
+version](https://img.shields.io/badge/Viash-v0.9.3-blue.svg)](https://viash.io)
+
+## Documentation
+
+Please find more in-depth documentation on [the
+website](https://openpipelines.bio/).
+
+## Functionality Overview
+
+Openpipelines execute a list of predefined tasks. These descrete steps
+are also provided as standalone components that can be executed
+individually, with a standardized interface. This is especially useful
+when a particular step wraps a tool that you do not necessarily always
+need to execute in a workflow context.
+
+In terms of workflows, the following functionality is provided:
+
+- Demultiplexing: conversion of raw sequencing data to FASTQ objects.
+- [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion):
+  Read mapping and generating a count matrix.
+- [Single sample
+  processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample):
+  cell filtering and doublet detection.
+- [Multisample
+  processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing):
+  Count transformation, normalization, QC metric calulations.
+- [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration):
+  Clustering, integration and batch correction using single and
+  multimodal methods.
+- Downstream analysis workflows
+
+``` mermaid lang="mermaid"
+flowchart LR
+  demultiplexing["Step 1: Demultiplexing"]
+  ingestion["Step 2: Ingestion"]
+  process_samples["Step 3: Process Samples"]
+  integration["Step 4: Integration"]
+  downstream["Step 5: Downstream"]
+  demultiplexing-->ingestion-->process_samples-->integration-->downstream
+```
+
+## Guided execution using Viash Hub (CLI and Seqera cloud)
+
+Openpipelines is now available on [Viash
+Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub
+provides a list of components and workflows, together with a graphical
+interface that guides you through the steps of running a workflow or
+standalone component. Intstructions are provided for using a local viash
+or nextflow executable (requires using a linux based OS), but connecting
+to a Seqera cloud instance is also supported.
+
+## Execution using the nextflow executable
+
+Executing a workflow is a bit more involved and requires familiarity
+with the command line interface (CLI).
+
+### Setup
+
+In order to use the workflows in this package on your local computer,
+you’ll need to do the following:
+
+- Install [nextflow](https://www.nextflow.io/docs/latest/install.html)
+- Install a nextflow compatible executor. This workflow provides a
+  profile for [docker](https://docs.docker.com/get-started/).
+
+### Location of the workflow scripts
+
+Nextflow workflow scripts, schema’s and configuration files can be found
+in the `target/nextflow` folder. On the `main` branch however, only the
+source code that needs to be build into the functionning workflows and
+components can be found. Instead, please refer to the `main_build`
+branch or any of the tags to find the `target` folders. Components and
+workflows are organized into namespaces, which can be nested. Workflows
+are located at `target/nextflow/workflows`, while components that
+execute individual workflow steps are
+
+A reference of workflows and modules is also provided in the
+[documentation](https://openpipelines.bio/components/).
+
+### Retrieving a list of a workflow parameters
+
+A list of workflows arguments can be consulted in multiple ways:
+
+- On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest)
+- In the [reference
+  documentation](https://openpipelines.bio/components/)
+- The config YAML file lists the argument for each workflow and
+  component
+- In the `target/nextflow` folder, a nextflow schema JSON file
+  (`nextflow_schema.json`) is provided next to each workflow `.nf` file.
+- Using nextflow on the CLI:
+
+``` bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/demux/main.nf \
+    --help
+```
+
+### Resource usage tuning
+
+Nextflow’s labels can be used to specify the amount of resources a
+process can use. This workflow uses the following labels for CPU, memory
+and disk:
+
+- `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem`
+- `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu`
+- `lowdisk`, `middisk`, `highdisk`, `veryhighdisk`
+
+The defaults for these labels can be found at
+`src/workflows/utils/labels.config`. Nextflow checks that the specified
+resources for a process do not exceed what is available on the machine
+and will not start if it does. Create your own config file to tune the
+labels to your needs, for example:
+
+    // Resource labels
+    withLabel: verylowcpu { cpus = 2 }
+    withLabel: lowcpu { cpus = 8 }
+    withLabel: midcpu { cpus = 16 }
+    withLabel: highcpu { cpus = 16 }
+
+    withLabel: verylowmem { memory = 4.GB }
+    withLabel: lowmem { memory = 8.GB }
+    withLabel: midmem { memory = 16.GB }
+    withLabel: highmem { memory = 32.GB }
+
+When starting nextflow using the CLI, you can use `-c` to provide the
+file to nextflow and overwrite the defaults.
+
+### Demultiplexing example
+
+Here, generating FASTQ files from raw sequencing data is demonstrated,
+based on data generated using 10X genomic’s protocols. However, BD
+genomics data is also supported by Openpipeline. If you wish to try it
+out yourself, test data is available at
+`s3://openpipelines-data/cellranger_tiny_bcl/bcl`.
+
+``` bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/demux/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    --publish_dir "<path to output directory>" \
+    --id "cellranger_tiny_bcl" \
+    --input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \
+    --sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \
+    --demultiplexer "mkfastq"
+```
+
+### Mapping and read counting
+
+FASTQ files can be mapped to a reference genome and the resulting mapped
+reads can be counted in order to generate a count matrix. Both
+`BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate
+using Cell Ranger multi on test data available at
+`s3://openpipelines-data/10x_5k_anticmv`.
+
+In order to facilitate passing multiple argument values, the parameters
+can be specified using a YAML file.
+
+``` yaml
+input:
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz"
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz"
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz"
+gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz"
+vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
+feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv"
+library_id:
+    - "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset"
+    - "5k_human_antiCMV_T_TBNK_connect_AB_subset"
+    - "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
+library_type:
+    - "Gene Expression"
+    - "Antibody Capture"
+    - "VDJ"
+```
+
+You can pass this file to nextflow using `-params-file`
+
+``` bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    -params-file "<path to your parameter YAML file>" \
+    --publish_dir "<path to output directory>"
+```
+
+### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration)
+
+Once you have an MuData object for each of your samples, you can process
+it into a multisample file that is ready for integration and other
+downstream analyses. This can be done using the `process_samples`
+workflow. Here is an example, but please keep in mind that the exact
+parameters that need to be provided differ depending on you data. A lot
+of functionality for this pipeline can be customized, including the name
+of the output slots where data is being stored.
+
+``` yaml
+param_list:
+    - id: "sample_1"
+      input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
+      rna_min_counts: 2
+    - id: "sample_2"
+      input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
+      rna_min_counts: 1
+rna_max_counts: 1000000
+rna_min_genes_per_cell: 1
+rna_max_genes_per_cell: 1000000
+rna_min_cells_per_gene: 1
+rna_min_fraction_mito: 0.0
+rna_max_fraction_mito: 1.0
+```
+
+In order to provide multiple samples to the pipeline, `param_list` is
+used. Using `param_list` it is possible to specify arguments per sample.
+However, it is still possible to define arguments for all samples
+together by listing those outside the `param_list` block.
+
+``` bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    -params-file "<path to your parameter YAML file>"
+    --publish_dir "<path to output directory>"
+```
+
+## Executing standalone components using the Viash executable
+
+Another option to execute individual modules on the CLI is to use
+`viash run`. All you need to do is download viash, clone the
+Openpipeline repository and point viash to a config file. However, keep
+in mind that using `viash run` for workflows is currently not supported.
+Please see `viash run --help` for more information on how to use the
+command, but here is an example:
+
+``` bash
+viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help
+```
--- a/README.qmd
+++ b/README.qmd
@@ -0,0 +1,205 @@
+---
+format: gfm
+---
+```{r setup, include=FALSE}
+project <- yaml::read_yaml("_viash.yaml")
+license <- paste0(project$links$repository, "/blob/main/LICENSE")
+```
+# OpenPipeline
+
+Extensible single cell analysis pipelines for reproducible and large-scale single cell processing using Viash and Nextflow.
+
+[![ViashHub](https://img.shields.io/badge/ViashHub-`r project$name`-7a4baa.svg)](https://www.viash-hub.com/packages/`r project$name`) 
+[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2F`r project$name`-blue.svg)](`r project$links$repository`) 
+[![GitHub License](https://img.shields.io/github/license/`r project$organization`/`r project$name`.svg)](`r license`) 
+[![GitHub Issues](https://img.shields.io/github/issues/`r project$organization`/`r project$name`.svg)](`r project$links$issue_tracker`) 
+[![Viash version](https://img.shields.io/badge/Viash-v`r gsub("-", "--", project$viash_version)`-blue.svg)](https://viash.io)
+
+## Documentation
+
+Please find more in-depth documentation on [the website](https://openpipelines.bio/).
+
+## Functionality Overview
+
+Openpipelines execute a list of predefined tasks. These descrete steps are also provided as standalone components that can be executed individually, with a standardized interface. This is especially useful when a particular step wraps a tool that you do not necessarily always need to execute in a workflow context.
+
+In terms of workflows, the following functionality is provided:
+
+* Demultiplexing: conversion of raw sequencing data to FASTQ objects.
+* [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion): Read mapping and generating a count matrix.
+* [Single sample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample): cell filtering and doublet detection.
+* [Multisample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing): Count transformation, normalization, QC metric calulations.
+* [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration): Clustering, integration and batch correction using single and multimodal methods.
+* Downstream analysis workflows
+
+
+```{mermaid lang='mermaid'}
+flowchart LR
+  demultiplexing["Step 1: Demultiplexing"]
+  ingestion["Step 2: Ingestion"]
+  process_samples["Step 3: Process Samples"]
+  integration["Step 4: Integration"]
+  downstream["Step 5: Downstream"]
+  demultiplexing-->ingestion-->process_samples-->integration-->downstream
+```
+
+## Guided execution using Viash Hub (CLI and Seqera cloud)
+
+Openpipelines is now available on [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub provides a list of components and workflows, together with a graphical interface that guides you through the steps of running a workflow or standalone component. Intstructions are provided for using a local viash or nextflow executable (requires using a linux based OS), but connecting to a Seqera cloud instance is also supported. 
+
+## Execution using the nextflow executable
+
+Executing a workflow  is a bit more involved and requires familiarity with the command line interface (CLI). 
+
+### Setup
+
+In order to use the workflows in this package on your local computer, you'll need to do the following:
+
+* Install [nextflow](https://www.nextflow.io/docs/latest/install.html)
+* Install a nextflow compatible executor. This workflow provides a profile for [docker](https://docs.docker.com/get-started/).
+
+### Location of the workflow scripts
+
+Nextflow workflow scripts, schema's and configuration files can be found in the `target/nextflow` folder.
+On the `main` branch however, only the source code that needs to be build into the functionning workflows and components can be found.
+Instead, please refer to the `main_build` branch or any of the tags to find the `target` folders.
+Components and workflows are organized into namespaces, which can be nested. Workflows are located at `target/nextflow/workflows`,
+while components that execute individual workflow steps are 
+
+A reference of workflows and modules is also provided in the [documentation](https://openpipelines.bio/components/).
+
+### Retrieving a list of a workflow parameters
+
+A list of workflows arguments can be consulted in multiple ways:
+
+* On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest) 
+* In the [reference documentation](https://openpipelines.bio/components/)
+* The config YAML file lists the argument for each workflow and component
+* In the `target/nextflow` folder, a nextflow schema JSON file (`nextflow_schema.json`) is provided next to each workflow `.nf` file.
+* Using nextflow on the CLI: 
+
+```bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/demux/main.nf \
+    --help
+```
+
+### Resource usage tuning
+
+Nextflow's labels can be used to specify the amount of resources a process can use. This workflow uses the following labels for CPU, memory and disk:
+
+* `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem`
+* `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu`
+* `lowdisk`, `middisk`, `highdisk`, `veryhighdisk`
+
+The defaults for these labels can be found at `src/workflows/utils/labels.config`. Nextflow checks that the specified resources for a process do not exceed what is available on the machine and will not start if it does. Create your own config file to tune the labels to your needs, for example:
+
+```
+// Resource labels
+withLabel: verylowcpu { cpus = 2 }
+withLabel: lowcpu { cpus = 8 }
+withLabel: midcpu { cpus = 16 }
+withLabel: highcpu { cpus = 16 }
+
+withLabel: verylowmem { memory = 4.GB }
+withLabel: lowmem { memory = 8.GB }
+withLabel: midmem { memory = 16.GB }
+withLabel: highmem { memory = 32.GB }
+```
+
+When starting nextflow using the CLI, you can use `-c` to provide the file to nextflow and overwrite the defaults.
+
+### Demultiplexing example
+
+Here, generating FASTQ files from raw sequencing data is demonstrated, based on data generated using 10X genomic's protocols. However, BD genomics data is also supported by Openpipeline. If you wish to try it out yourself, test data is available at `s3://openpipelines-data/cellranger_tiny_bcl/bcl`.
+
+```bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/demux/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    --publish_dir "<path to output directory>" \
+    --id "cellranger_tiny_bcl" \
+    --input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \
+    --sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \
+    --demultiplexer "mkfastq"
+```
+
+### Mapping and read counting
+
+FASTQ files can be mapped to a reference genome and the resulting mapped reads can be counted in order to generate a count matrix. Both `BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate using Cell Ranger multi on test data available at `s3://openpipelines-data/10x_5k_anticmv`. 
+
+
+In order to facilitate passing multiple argument values, the parameters can be specified using a YAML file.
+```yaml
+input:
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz"
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz"
+    - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz"
+gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz"
+vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
+feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv"
+library_id:
+    - "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset"
+    - "5k_human_antiCMV_T_TBNK_connect_AB_subset"
+    - "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
+library_type:
+    - "Gene Expression"
+    - "Antibody Capture"
+    - "VDJ"
+```
+You can pass this file to nextflow using `-params-file`
+
+```bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    -params-file "<path to your parameter YAML file>" \
+    --publish_dir "<path to output directory>"
+```
+
+### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration)
+
+Once you have an MuData object for each of your samples, you can process it into a multisample file that is ready for integration and other downstream analyses. This can be done using the `process_samples` workflow. Here is an example, but please keep in mind that the exact parameters that need to be provided differ depending on you data. A lot of functionality for this pipeline can be customized, including the name of the output slots where data is being stored. 
+
+```yaml
+param_list:
+    - id: "sample_1"
+      input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
+      rna_min_counts: 2
+    - id: "sample_2"
+      input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
+      rna_min_counts: 1
+rna_max_counts: 1000000
+rna_min_genes_per_cell: 1
+rna_max_genes_per_cell: 1000000
+rna_min_cells_per_gene: 1
+rna_min_fraction_mito: 0.0
+rna_max_fraction_mito: 1.0
+```
+
+In order to provide multiple samples to the pipeline, `param_list` is used. Using `param_list` it is possible to specify arguments per sample. However, it is still possible to define arguments for all samples together by listing those outside the `param_list` block.
+
+```bash
+nextflow run openpipelines-bio/openpipeline \
+    -r 2.1.1 \
+    -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
+    -c "<path to resource config file>" \
+    -profile docker \
+    -params-file "<path to your parameter YAML file>"
+    --publish_dir "<path to output directory>"
+```
+
+
+## Executing standalone components using the Viash executable
+
+Another option to execute individual modules on the CLI is to use `viash run`. All you need to do is download viash, clone the Openpipeline repository and point viash to a config file. However, keep in mind that using `viash run` for workflows is currently not supported. Please see `viash run --help` for more information on how to use the command, but here is an example:
+
+
+```bash
+viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help
+```
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -0,0 +1,40 @@
+viash_version: 0.9.7
+source: src
+target: target
+# Note: this causes the docker images to be renamed
+name: openpipeline
+organization: vsh
+summary: |
+  Best-practice workflows for single-cell multi-omics analyses.
+description: |
+  OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).
+
+  In terms of workflows, the following has been made available, but keep in mind that
+  individual tools and functionality can be executed as standalone components as well.
+
+    * Demultiplexing: conversion of raw sequencing data to FASTQ objects.
+    * Ingestion: Read mapping and generating a count matrix.
+    * Single sample processing: cell filtering and doublet detection.
+    * Multisample processing: Count transformation, normalization, QC metric calulations.
+    * Integration: Clustering, integration and batch correction using single and multimodal methods.
+    * Downstream analysis workflows
+license: MIT
+keywords: [single-cell, multimodal]
+links:
+  repository: https://github.com/openpipelines-bio/openpipeline
+  docker_registry: ghcr.io
+  homepage: https://openpipelines.bio
+  documentation: https://openpipelines.bio/fundamentals
+  issue_tracker: https://github.com/openpipelines-bio/openpipeline/issues
+info:
+  test_resources:
+    - type: s3
+      path: s3://openpipelines-data
+      dest: resources_test
+  nextflow_labels_ci:
+    - path: src/workflows/utils/labels_ci.config
+      description: Adds the correct memory and CPU labels when running on the Viash Hub CI.
+config_mods: |
+  .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
+  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
+version: processing-workflows-quantile-filtering
--- a/images/concepts/fig.svg
+++ b/images/concepts/fig.svg
--- a/images/concepts/fig_cell.svg
+++ b/images/concepts/fig_cell.svg
--- a/images/concepts/fig_modality_adt.svg
+++ b/images/concepts/fig_modality_adt.svg
--- a/images/concepts/fig_modality_atac.svg
+++ b/images/concepts/fig_modality_atac.svg
--- a/images/concepts/fig_modality_rna.svg
+++ b/images/concepts/fig_modality_rna.svg
--- a/images/concepts/fig_modality_vdj.svg
+++ b/images/concepts/fig_modality_vdj.svg
--- a/images/concepts/fig_workflow_multiomics_adt_multisample.svg
+++ b/images/concepts/fig_workflow_multiomics_adt_multisample.svg
--- a/images/concepts/fig_workflow_multiomics_adt_singlesample.svg
+++ b/images/concepts/fig_workflow_multiomics_adt_singlesample.svg
--- a/images/concepts/fig_workflow_multiomics_rna_multisample.svg
+++ b/images/concepts/fig_workflow_multiomics_rna_multisample.svg
--- a/images/concepts/fig_workflow_multiomics_rna_singlesample.svg
+++ b/images/concepts/fig_workflow_multiomics_rna_singlesample.svg
--- a/images/concepts/generate_subimages.sh
+++ b/images/concepts/generate_subimages.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# so let's do it separately
+rm images/concepts/fig_*.svg
+
+for id in cell modality_rna modality_adt modality_vdj modality_atac workflow_multiomics_rna_singlesample workflow_multiomics_rna_multisample workflow_multiomics_adt_singlesample workflow_multiomics_adt_multisample; do
+  inkscape --export-type="svg" --export-id="$id" --export-id-only images/concepts/fig.svg
+  svgo images/concepts/fig_${id}.svg
+done
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,5 @@
+nextflow.enable.dsl=2
+
+workflow {
+    print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
+}
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,22 @@
+// template nextflow.config for nested workflows
+
+manifest {
+  nextflowVersion = '!>=20.12.1-edge'
+}
+
+// TODO 1: unquote and adapt `rootDir` according to relative path within project
+// params {
+//   rootDir = "$projectDir/../.."
+// }
+// 
+// workflowDir = "${params.rootDir}/workflows"
+// targetDir = "${params.rootDir}/target/nextflow"
+
+// TODO 2: insert custom imports here
+
+// TODO 3: unquote
+// docker {
+//   runOptions = "-v \$(realpath ${params.rootDir}):\$(realpath ${params.rootDir})"
+// }
+
+
--- a/resources_test_scripts/10x_20k_fixed.sh
+++ b/resources_test_scripts/10x_20k_fixed.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_fixed
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+
+# dataset page:
+# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing `basename $input`"
+    seqkit head -n 200000 "$input" | gzip > "$output"
+  fi
+}
+
+orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
+
+seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz"
+
+# download feature reference
+feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
+if [[ ! -f "$feature_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
+fi
+
+# download probe set
+probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
+if [[ ! -f "$probe_set" ]]; then
+  wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set"
+fi
+
+sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
+
+probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
+if [[ ! -f "$probe_set_corrected" ]]; then
+  reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
+  gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf" 
+  cat "$probe_set" | while read line || [[ -n $line ]];
+  do
+    echo "Line: $line"
+    old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
+    echo "Old ID: $old_id"
+    if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
+      echo "Just writing line"
+      printf "%s\n" "$line" >> "$probe_set_corrected"
+    else
+      gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
+      if [ ! -z "$gtf_lookup" ]; then
+        echo "Found hit"
+        new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
+        echo "New ID: $new_id"
+        new_line=${line/"$old_id"/"$new_id"}
+        printf "%s\n" "$new_line" >> "$probe_set_corrected"
+      else
+        echo "Did not find hit"
+      fi
+    fi
+  done
+fi
+
+# Input FASTA:
+#   >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
+# Output FASTA:
+#   >chr1 1
+input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
+fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
+if [[ ! -f "$input_fastq" ]]; then
+  wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
+fi
+zcat "$input_fastq" \
+    | sed -E 's/^>(\S+).*/>\1 \1/' \
+    | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
+    | sed -E 's/^>MT />chrM /' \
+    > "$fasta_modified"
+
+pigz --fast "$fasta_modified"
+fasta_modified="$fasta_modified.gz"
+# Input GTF:
+#     ... gene_id "ENSG00000223972.5"; ...
+# Output GTF:
+#     ... gene_id "ENSG00000223972"; gene_version "5"; ...
+input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
+gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
+if [[ ! -f "$input_gtf" ]]; then
+  wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
+fi
+
+REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
+zcat "$input_gtf" \
+    | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
+    | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
+    | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
+    > "$gtf_modified"
+pigz --fast "$gtf_modified"
+gtf_modified="$gtf_modified.gz"
+
+final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
+if [ ! -f "$final_genome" ]; then
+  nextflow \
+    run openpipelines-bio/openpipeline \
+    -r 3.0.0 \
+    -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
+    -profile docker \
+    -resume \
+    --id "GRCh38" \
+    --genome_fasta "$fasta_modified" \
+    --transcriptome_gtf "$gtf_modified" \
+    --target "cellranger" \
+    --output_fasta "reference.fa.gz" \
+    --output_gtf "reference.gtf.gz" \
+    --output_cellranger "GRCh38.cellranger.genome.fa.gz" \
+    --publish_dir "$HOME/.cache/openpipeline/"
+fi
+
+
+# Run mapping pipeline
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - ${orig_sample_id}_subset
+  library_type:
+    - "Gene Expression"
+  library_lanes:
+    - "any"
+
+probe_set: "$probe_set_corrected"
+gex_reference: "$genome_tar"
+feature_reference: "$feature_ref"
+probe_barcode_ids:
+  - BC001
+  - BC002
+  - BC003
+  - BC004
+sample_ids:
+  - Liver_BC1
+  - Ovarian_BC2
+  - Colorectal_BC3
+  - Pancreas_BC4
+gex_generate_bam: false
+sample_force_cells:
+  - 5000
+  - -1
+  - -1
+  - -1
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config \
+  --publish_dir "$OUT/processed"
+
+mkdir -p "${OUT}/processed"
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config \
+  --publish_dir "${OUT}_v10/processed"
--- a/resources_test_scripts/10x_4plex_dtc.sh
+++ b/resources_test_scripts/10x_4plex_dtc.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_4plex_dtc
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+
+# dataset page:
+# https://www.10xgenomics.com/datasets/40k-mixture-of-dissociated-tumor-cells-from-3-donors-stained-with-totalseqc-antibodies
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/4plex_DTC_kidney_lung_breast_TotalSeqC_fastqs"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing `basename $input`"
+    seqkit head -n 100000 "$input" | gzip > "$output"
+  fi
+}
+
+orig_sample_id="4plex_DTC_kidney_lung_breast_TotalSeqC"
+
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R2_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R2_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R2_001.fastq.gz"
+
+
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R2_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R2_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R2_001.fastq.gz"
+
+# download feature reference
+feature_ref="$raw_dir/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv"
+if [[ ! -f "$feature_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
+fi
+
+# download probe set
+probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv"
+if [[ ! -f "$probe_set" ]]; then
+  wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv" -O "$probe_set"
+fi
+
+sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
+
+probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A-corrected.csv"
+if [[ ! -f "$probe_set_corrected" ]]; then
+  reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
+  gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf" 
+  cat "$probe_set" | while read line || [[ -n $line ]];
+  do
+    echo "Line: $line"
+    old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
+    echo "Old ID: $old_id"
+    if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
+      echo "Just writing line"
+      printf "%s\n" "$line" >> "$probe_set_corrected"
+    else
+      gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
+      if [ ! -z "$gtf_lookup" ]; then
+        echo "Found hit"
+        new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
+        echo "New ID: $new_id"
+        new_line=${line/"$old_id"/"$new_id"}
+        printf "%s\n" "$new_line" >> "$probe_set_corrected"
+      else
+        echo "Did not find hit"
+      fi
+    fi
+  done
+fi
+
+# Run mapping pipeline
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - ${orig_sample_id}_gex_subset
+    - ${orig_sample_id}_ab_subset
+  library_type:
+    - "Gene Expression"
+    - "Antibody Capture"
+  library_lanes:
+    - "any"
+    - "any"
+
+probe_set: "$probe_set_corrected"
+gex_reference: "$genome_tar"
+feature_reference: "$feature_ref"
+probe_barcode_ids:
+  - BC001+AB001
+  - BC002+AB002
+  - BC003+AB003
+  - BC004+AB004
+sample_ids:
+  - Kidney_Cancer1_BC1_AB1
+  - Kidney_Cancer2_BC2_AB2
+  - Breast_Cancer_BC3_AB3
+  - Lung_Cancer_BC4_AB4
+gex_generate_bam: false
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  --publish_dir "$OUT/processed" \
+  -c src/workflows/utils/labels_ci.config
+
+mkdir -p "${OUT}_v10/processed"
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  --publish_dir "${OUT}_v10/processed" \
+  -c src/workflows/utils/labels_ci.config
--- a/resources_test_scripts/10x_5k_anticmv.sh
+++ b/resources_test_scripts/10x_5k_anticmv.sh
@@ -0,0 +1,232 @@
+#!/bin/bash
+
+set -eo pipefail
+
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_anticmv
+OUT=resources_test/$ID
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/integrated-gex-totalseqc-and-tcr-analysis-of-connect-generated-library-from-5k-cmv-t-cells-2-standard
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/5k_human_antiCMV_T_TBNK_connect_Multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing `basename $input`"
+    seqkit head -n 200000 "$input" | gzip > "$output"
+  fi
+}
+
+orig_sample_id="5k_human_antiCMV_T_TBNK_connect"
+
+seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R1_001.fastq.gz"
+seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R2_001.fastq.gz"
+
+# download immune panel fasta if needed
+feature_reference="$raw_dir/feature_reference.csv"
+if [[ ! -f "$feature_reference" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_count_feature_reference.csv" -O "$feature_reference"
+fi
+
+# download vdj reference if needed
+vdj_ref="$raw_dir/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
+if [[ ! -f "$vdj_ref" ]]; then
+  wget "https://cf.10xgenomics.com/supp/cell-vdj/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" -O "$vdj_ref"
+fi
+
+
+# Run mapping pipeline
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_GEX_1_subset"
+    - "${orig_sample_id}_AB_subset"
+    - "${orig_sample_id}_VDJ_subset"
+  library_type:
+    - "Gene Expression"
+    - "Antibody Capture"
+    - "VDJ"
+
+gex_reference: "$genome_tar"
+vdj_reference: "$vdj_ref"
+feature_reference: "$feature_reference"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -r 3.0.0 \
+  -resume \
+  --publish_dir "$OUT/processed" \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
+
+mkdir -p "${OUT}_v10/processed"
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  --publish_dir "${OUT}_v10/processed" \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
+
+# Convert to h5mu
+cat > /tmp/params.yaml << HERE
+id: "$orig_sample_id"
+input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output"
+publish_dir: "$OUT/"
+output: "*.h5mu"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu"
+
+
+# run qc workflow
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/$orig_sample_id.h5mu"
+var_name_mitochondrial_genes: mitochondrial
+var_name_ribosomal_genes: ribosomal
+publish_dir: "$OUT/"
+output: "${orig_sample_id}_qc.h5mu"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/workflows/qc/qc/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+
+# Run full pipeline
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/${orig_sample_id}_qc.h5mu"
+publish_dir: "$OUT/"
+output: "${orig_sample_id}_mms.h5mu"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+# create fastqc directory
+fastqc_dir="$OUT/fastqc"
+mkdir -p "$fastqc_dir"
+
+./target/executable/qc/fastqc/fastqc \
+  --input "$raw_dir" \
+  --mode "dir" \
+  --output "$fastqc_dir"
+
+
+# Create a test dataset for the Custom modality
+# by just labeling the AB as custom
+feat_ref_name=$(basename $feature_reference)
+sed -e 's/Antibody Capture/Custom/g' "$feature_reference" > "/tmp/custom_${feat_ref_name}"
+
+cat > /tmp/params_custom.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_GEX_1_subset"
+    - "${orig_sample_id}_AB_subset"
+    - "${orig_sample_id}_VDJ_subset"
+  library_type:
+    - "Gene Expression"
+    - "Custom"
+    - "VDJ"
+
+gex_reference: "$genome_tar"
+feature_reference: "/tmp/custom_${feat_ref_name}"
+vdj_reference: "$vdj_ref"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params_custom.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config \
+  --publish_dir "${OUT}/processed_with_custom"
+
+
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params_custom.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config \
+  --publish_dir "${OUT}_v10/processed_with_custom"
--- a/resources_test_scripts/10x_5k_beam.sh
+++ b/resources_test_scripts/10x_5k_beam.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_beam
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/datasets/5k-human-a0201-b0702-pbmcs-beam-t-2-standard
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/5k_human_A0201_B0702_PBMCs_BEAM_T"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing `basename $input`"
+    seqkit head -n 200000 "$input" | gzip > "$output"
+  fi
+}
+
+orig_sample_id="beamt_human_A0201_B0702_pbmc"
+
+seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R1_001.fastq.gz" 
+seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R2_001.fastq.gz" 
+
+seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R1_001.fastq.gz" 
+seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R2_001.fastq.gz" 
+
+# download feature reference
+feature_ref="$raw_dir/beamt_human_A0201_B0702_pbmc_feature_reference.csv"
+if [[ ! -f "$feature_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_count_feature_reference.csv" -O "$feature_ref"
+fi
+
+# download vdj reference if needed
+vdj_ref="$raw_dir/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz"
+if [[ ! -f "$vdj_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -O "$vdj_ref"
+fi
+
+# Run mapping pipeline
+# TODO: Also include conversion to h5mu
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_gex_subset"
+    - "${orig_sample_id}_vdj_subset"
+    - "${orig_sample_id}_ag_subset"
+  library_type:
+    - "Gene Expression"
+    - "VDJ-T"
+    - "Antigen Capture"
+
+gex_reference: "$genome_tar"
+feature_reference: "$feature_ref"
+vdj_reference: "$vdj_ref"
+control_id:
+    - negative_control_A0201
+    - negative_control_B0702
+mhc_allele:
+    - "HLA-A*02:01"
+    - "HLA-B*07:02"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config \
+  --publish_dir "$OUT/processed"
+
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config \
+  --publish_dir "${OUT}_v10/processed"
+
+# Create h5mu
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/processed/$ID.cellranger_multi.output"
+publish_dir: "$OUT/"
+output: "*.h5mu"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels_ci.config
+
+mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu"
--- a/resources_test_scripts/10x_5k_lung_crispr.sh
+++ b/resources_test_scripts/10x_5k_lung_crispr.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=10x_5k_lung_crispr
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1/"
+genome_tar="$reference_dir/reference_cellranger.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/5-k-a-549-lung-carcinoma-cells-no-treatment-transduced-with-a-crispr-pool-3-1-standard-6-0-0
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+
+    # download fastqs and untar
+    wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_fastqs.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing `basename $input`"
+    seqkit head -n 200000 "$input" | gzip > "$output"
+  fi
+}
+
+orig_sample_id="SC3_v3_NextGem_DI_CRISPR_A549_5K"
+
+seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R2_001.fastq.gz"
+
+seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R1_001.fastq.gz"
+seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R2_001.fastq.gz"
+
+
+# download crispr feature reference
+crispr_ref="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv"
+if [[ ! -f "$crisp_ref" ]]; then
+  wget "https://cf.10xgenomics.com/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" -O "$crispr_ref"
+fi
+
+crispr_ref_adjusted="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv"
+reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
+cat "$crispr_ref" | while read line || [[ -n $line ]];
+do 
+  echo "Line: $line"
+  old_id=$( printf "%s\n" "$line" | awk -F',' '{print $7}' )
+  echo "Old ID: $old_id"
+  if [ "$old_id" = "Non-Targeting" ] || [ "$old_id" = "target_gene_id" ] ; then
+    echo "Just writing line"
+    printf "%s\n" "$line" >> "$crispr_ref_adjusted"
+  else
+    gtf_lookup=$(zgrep "$old_id" "$reference_gtf" || test $? = 1;)
+    if [ ! -z "$gtf_lookup" ]; then
+      echo "Found hit"
+      new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
+      echo "New ID: $new_id"
+      new_line=${line/"$old_id"/"$new_id"}
+      printf "%s\n" "$new_line" >> "$crispr_ref_adjusted"
+    else
+      echo "Did not find hit"
+    fi
+  fi
+done
+
+
+# Run mapping pipeline
+# As of Cell Ranger v10, gex_secondary_analysis is required
+# to generate CRISPR output. 
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_gex_subset"
+    - "${orig_sample_id}_crispr_subset"
+  library_type:
+    - "Gene Expression"
+    - "CRISPR Guide Capture"
+
+gex_reference: "$genome_tar"
+feature_reference: "$crispr_ref_adjusted"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  --publish_dir "${OUT}/processed"
+
+mkdir -p "${OUT}_v10/processed"
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  --publish_dir "${OUT}_v10/processed" \
+  --gex_secondary_analysis
+
+# Create h5mu
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/processed/10x_5k_lung_crispr.cellranger_multi.output"
+output: "*.h5mu"
+HERE
+
+nextflow \
+  run openpipelines-bio/openpipeline \
+  -r 3.0.0 \
+  -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  --publish_dir "$OUT/"
+mv "${OUT}/0.h5mu"  "${OUT}/$orig_sample_id.h5mu"
+
--- a/resources_test_scripts/annotation_test_data.sh
+++ b/resources_test_scripts/annotation_test_data.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=annotation_test_data
+OUT=resources_test/$ID/
+
+# ideally, this would be a versioned pipeline run
+[ -d "$OUT" ] || mkdir -p "$OUT"
+
+# Download Tabula Sapiens Blood reference h5ad from https://doi.org/10.5281/zenodo.7587774
+wget "https://zenodo.org/record/7587774/files/TS_Blood_filtered.h5ad?download=1" -O "${OUT}/tmp_TS_Blood_filtered.h5ad"
+
+# Download Tabula Sapiens Blood pretrained model from https://doi.org/10.5281/zenodo.7580707
+wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?download=1" -O "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
+
+# Download PopV specific CL ontology files - needed for OnClass
+# OUT_ONTOLOGY="${OUT}/ontology"
+# [ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.obo \
+# -O "${OUT_ONTOLOGY}/cl.obo"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology \
+# -O "${OUT_ONTOLOGY}/cl.ontology"
+# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology.nlp.emb \
+# -O "${OUT_ONTOLOGY}/cl.ontology.nlp.emb"
+
+
+# Process Tabula Sapiens Blood reference h5ad
+# (Select one individual and 100 cells per cell type)
+# normalize and log1p transform data
+# Add treatment and disease columns
+python <<HEREDOC
+import anndata as ad
+import scanpy as sc
+import numpy as np
+
+# Read in data
+ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
+sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"] 
+n=100
+s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
+sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
+
+# Normalize and log1p transform data
+data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X)
+sc.pp.normalize_total(data_for_scanpy, target_sum=10000)
+sc.pp.log1p(
+    data_for_scanpy,
+    base=None,
+    layer=None,
+    copy=False,
+)
+sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X
+
+# Add treatment and disease columns
+n_cells = sub_ref_adata_final.n_obs
+treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5])
+disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
+sub_ref_adata_final.obs["treatment"] = treatment
+sub_ref_adata_final.obs["disease"] = disease
+
+# Strip raw slot - not needed for annotation and causes compatibility issues between AnnData/MuData versions
+sub_ref_adata_final = sub_ref_adata_final.copy()
+sub_ref_adata_final.raw = None
+
+# Write out data
+sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
+HEREDOC
+
+
+echo "> Converting to h5mu"
+viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5ad" \
+    --output "${OUT}/TS_Blood_filtered.h5mu" \
+    --modality "rna"
+
+rm "${OUT}/tmp_TS_Blood_filtered.h5ad"
+
+echo "> Downloading pretrained CellTypist model and sample test data"
+wget https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl \
+    -O "${OUT}/celltypist_model_Immune_All_Low.pkl"
+wget https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad \
+    -O "${OUT}/demo_2000_cells.h5ad"
+viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/demo_2000_cells.h5ad" \
+    --output "${OUT}/demo_2000_cells.h5mu" \
+    --modality "rna"
+
+
+echo "> Fetching OnClass data and models"
+OUT_ONTOLOGY="${OUT}/ontology"
+[ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
+wget https://figshare.com/ndownloader/files/28394466 -O "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
+tar -xzvf "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" -C "${OUT_ONTOLOGY}" --strip-components=2
+rm "${OUT_ONTOLOGY}/allen.ontology"
+rm "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
+
+wget https://figshare.com/ndownloader/files/28394541 -O "${OUT}/OnClass_models.tar.gz"
+tar -xzvf "${OUT}/OnClass_models.tar.gz" -C "${OUT}" --strip-components=1
+rm "${OUT}/OnClass_models.tar.gz"
+rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
+
+find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} +
+mv "${OUT}/Pretrained_model" "${OUT}/onclass_model"
+
+echo "> Creating simple SCVI model"
+viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --obs_batch "donor_id" \
+    --var_gene_names "ensemblid" \
+    --output "${OUT}/scvi_output.h5mu" \
+    --output_model "${OUT}/scvi_model" \
+    --max_epochs 5 \
+    --n_obs_min_count 10 \
+    --n_var_min_count 10
+
+echo "> Creating SCVI model with covariates"
+viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --obs_batch "donor_id" \
+    --var_gene_names "ensemblid" \
+    --obs_categorical_covariate "assay" \
+    --obs_categorical_covariate "donor_assay" \
+    --output "${OUT}/scvi_covariate_output.h5mu" \
+    --output_model "${OUT}/scvi_covariate_model" \
+    --max_epochs 5 \
+    --n_obs_min_count 10 \
+    --n_var_min_count 10
+
+echo "> Creating simple SCANVI model"
+viash run src/annotate/scanvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --var_gene_names "ensemblid" \
+    --obs_labels "cell_ontology_class" \
+    --scvi_model "${OUT}/scvi_model" \
+    --output "${OUT}/scanvi_output.h5mu" \
+    --output_model "${OUT}/scanvi_model" \
+    --max_epochs 5 
+
+echo "> Creating SCANVI model with covariates"
+viash run src/annotate/scanvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --var_gene_names "ensemblid" \
+    --obs_labels "cell_ontology_class" \
+    --scvi_model "${OUT}/scvi_covariate_model" \
+    --output "${OUT}/scanvi_covariate_output.h5mu" \
+    --output_model "${OUT}/scanvi_covariate_model" \
+    --max_epochs 5 
+
+rm "${OUT}/scanvi_output.h5mu"
+rm "${OUT}/scanvi_covariate_output.h5mu"
+rm "${OUT}/scvi_output.h5mu"
+rm "${OUT}/scvi_covariate_output.h5mu"
+rm -r "${OUT}/Pretrained_model/"
+
+echo "> Creating Pseudobulk Data for DGEA"
+viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --obs_grouping "cell_type" \
+    --obs_sample_conditions "donor_id" \
+    --obs_sample_conditions "treatment" \
+    --obs_sample_conditions "disease" \
+    --min_num_cells_per_sample 5 \
+    --output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu" 
--- a/resources_test_scripts/aws_sync.sh
+++ b/resources_test_scripts/aws_sync.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -eo pipefail
+
+aws s3 sync --profile di "resources_test" "s3://openpipelines-data" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
+
+id=cellranger_tiny_fastq
+aws s3 sync --profile di "resources_test/$id" "s3://openpipelines-data/$id" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
--- a/resources_test_scripts/bdrhap_5kjrt.sh
+++ b/resources_test_scripts/bdrhap_5kjrt.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=bdrhap_5kjrt
+OUT=resources_test/$ID
+n_threads=30
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# check whether reference is available
+reference_dir="resources_test/reference_gencodev41_chr1"
+genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
+if [[ ! -f "$genome_tar" ]]; then
+    echo "$genome_tar does not exist. Please create the reference genome first"
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/12WTA-ABC-SMK-EB-5kJRT"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+    wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+genome_dir="$raw_dir/temp_reference_gencodev41_chr1"
+if [[ ! -d "$genome_dir" ]]; then
+  echo "> Untarring genome"
+  mkdir -p "$genome_dir"
+  tar -xvf "$genome_tar" -C "$genome_dir"
+fi
+
+# process WTA fastq files
+# map to chr1, subsample chr1 reads 
+mapping_dir="$raw_dir/temp_mapping_chr_1"
+if [[ ! -f "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" ]]; then
+  echo "> Processing 12WTA_S1_L432_R[12]_001.fastq.gz"
+  mkdir -p "$mapping_dir"
+  # MUST USE A STAR THAT IS COMPATIBLE WITH BD RHAPSODY
+  # For the cwl pipeline 1.9.1, 2.5.2b should work.
+  echo "star"
+  docker run --rm -i \
+    -v "`pwd`/$OUT:`pwd`/$OUT" \
+    -v "$tar_dir:$tar_dir" \
+    -w `pwd` bdgenomics/rhapsody:1.10.1 \
+    STAR \
+      --runThreadN "$n_threads" \
+      --genomeDir "$genome_dir" \
+      --readFilesIn "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" \
+      --runRNGseed 100 \
+      --outFileNamePrefix "$mapping_dir/" \
+      --readFilesCommand "gzip -d -k -c" \
+      --clip3pAdapterSeq "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
+      --outFilterMatchNmin "25" \
+      --quantTranscriptomeBan "Singleend" # Prohibit mapping of one side of the read
+  # chown to current user before removing mapping dir
+  docker run --rm -i -v "`pwd`/$OUT:`pwd`/$OUT" -w `pwd` bdgenomics/rhapsody:1.10.1 \
+    chown "$(id -u):$(id -g)" --silent --recursive "$mapping_dir/"
+
+  echo "samtools"
+  samtools view -F 260 "$mapping_dir/Aligned.out.sam" > "$mapping_dir/primary_aligned_reads.sam"
+  echo "cut"
+  cut -f 1 "$mapping_dir/primary_aligned_reads.sam" | sort | uniq > "$mapping_dir/mapped_reads.txt"
+  head -500000 "$mapping_dir/mapped_reads.txt" > "$mapping_dir/mapped_reads_subset.txt"
+  echo "seqkit"
+  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R1_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq"
+  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq"
+
+  # rm -r "$mapping_dir"
+  # rm -r "$genome_dir"
+fi
+
+# subsample other files
+smk_r1_file="$raw_dir/12SMK_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$smk_r1_file" ]]; then
+  echo "> Processing `basename $smk_r1_file`"
+  seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R1_001.fastq.gz" | gzip > "$smk_r1_file"
+fi
+smk_r2_file="$raw_dir/12SMK_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$smk_r2_file" ]]; then
+  echo "> Processing `basename $smk_r2_file`"
+  seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R2_001.fastq.gz" | gzip > "$smk_r2_file"
+fi
+abc_r1_file="$raw_dir/12ABC_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$abc_r1_file" ]]; then
+  echo "> Processing `basename $abc_r1_file`"
+  seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R1_001.fastq.gz" | gzip > "$abc_r1_file"
+fi
+abc_r2_file="$raw_dir/12ABC_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$abc_r2_file" ]]; then
+  echo "> Processing `basename $abc_r2_file`"
+  seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R2_001.fastq.gz" | gzip > "$abc_r2_file"
+fi
+wta_r1_file="$raw_dir/12WTA_S1_L432_R1_001_subset.fastq.gz"
+if [[ ! -f "$wta_r1_file" ]]; then
+  echo "> Processing `basename $wta_r1_file`"
+  gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" > "$wta_r1_file"
+fi
+wta_r2_file="$raw_dir/12WTA_S1_L432_R2_001_subset.fastq.gz"
+if [[ ! -f "$wta_r2_file" ]]; then
+  echo "> Processing `basename $wta_r2_file`"
+  gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" > "$wta_r2_file"
+fi
+# copy immune panel fasta
+fasta_file="$raw_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta"
+if [[ ! -f "$fasta_file" ]]; then
+  cp "$tar_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" "$fasta_file"
+fi
+
+genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
+
+nextflow run . \
+  -main-script target/nextflow/workflows/ingestion/bd_rhapsody/main.nf  \
+  -resume \
+  -profile docker,mount_temp \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/errorstrat_ignore.config \
+  --reads "$wta_r1_file;$wta_r2_file;$abc_r1_file;$abc_r2_file;$smk_r1_file;$smk_r2_file" \
+  --reference_archive "$genome_tar" \
+  --abseq_reference "$fasta_file" \
+  --sample_tags_version "hs" \
+  --tag_names "1-Jurkat;2-Ramos;3-THP1" \
+  --output_raw "output_raw" \
+  --output "output.h5mu" \
+  --output_state state.yaml \
+  --cell_calling_data "mRNA" \
+  --exact_cell_count 4000 \
+  --generate_bam true \
+  --publish_dir "$OUT/processed"
--- a/resources_test_scripts/bdrhap_vdj.sh
+++ b/resources_test_scripts/bdrhap_vdj.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# TODO: we should turn this into viash components
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=bdrhap_vdj
+OUT=resources_test/$ID
+n_threads=30
+
+# create raw directory
+raw_dir="$OUT/raw"
+mkdir -p "$raw_dir"
+
+# Check whether seqkit is available
+if ! command -v seqkit &> /dev/null; then
+    echo "This script requires seqkit. Please make sure the binary is added to your PATH."
+    exit 1
+fi
+
+# download and untar source fastq files
+tar_dir="$HOME/.cache/openpipeline/VDJDemo"
+if [[ ! -d "$tar_dir" ]]; then
+    mkdir -p "$tar_dir"
+    wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/VDJDemo/VDJDemo.tar" -O "$tar_dir.tar"
+    tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
+    rm "$tar_dir.tar"
+fi
+
+# subset fastq files
+for sample_id in RhapVDJDemo-BCR_S1_L001_R1_001 RhapVDJDemo-BCR_S1_L001_R2_001 RhapVDJDemo-mRNA_S5_L001_R1_001 RhapVDJDemo-mRNA_S5_L001_R2_001 RhapVDJDemo-TCR_S3_L001_R1_001 RhapVDJDemo-TCR_S3_L001_R2_001; do
+  subset_file="$raw_dir/${sample_id}_subset.fastq.gz"
+  if [[ ! -f "$subset_file" ]]; then
+  echo "> Processing $sample_id"
+    seqkit head -n 300000 "$tar_dir/$sample_id.fastq.gz" | gzip > "$subset_file"
+  fi
+  unset subset_file
+done
+
+# copy immune panel fasta
+fasta_file="$raw_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta"
+if [[ ! -f "$fasta_file" ]]; then
+  cp "$tar_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" "$fasta_file"
+fi
+
+# create params file
+cat > /tmp/params.yaml << HERE
+param_list:
+- id: "targeted_vdj"
+  input: "$raw_dir/RhapVDJDemo-*_S*_L001_R[12]_001_subset.fastq.gz"
+mode: targeted
+reference: "$fasta_file"
+publish_dir: "$OUT/processed"
+putative_cell_call: "mRNA"
+vdj_version: human
+HERE
+
+# run bd rhapsody pipeline
+nextflow \
+  run . \
+  -main-script src/workflows/ingestion/bd_rhapsody/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -with-trace work/trace.txt \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
--- a/resources_test_scripts/cellranger_atac_tiny_bcl.sh
+++ b/resources_test_scripts/cellranger_atac_tiny_bcl.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_atac_tiny_bcl
+OUT="resources_test/$ID/"
+DIR="$OUT"
+REFERENCE_DIR=resources_test/reference_gencodev41_chr1
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb
+
+
+# download bcl data
+if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
+  mkdir -p "$OUT/bcl"
+
+  # download tar gz
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-1.0.0.tar.gz \
+    --output "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
+  
+  # untar
+  
+  tar -xf "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" \
+    --strip-components=1 \
+    -C "$OUT/bcl"
+
+  # remove tar
+  rm "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
+
+  # Download the layout file. It contains info about the samples (1 in this case) and lanes
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-simple-1.0.0.csv \
+    --output "${OUT}/bcl/layout.csv"
+
+  # download sample sheet
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-samplesheet-1.0.0.csv \
+    --output "${OUT}/bcl/sample_sheet.csv"
+fi
+
+if [ ! -d "${OUT}/fastqs" ]; then
+  mkdir -p "$OUT/fastqs"
+
+  target/docker/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq \
+    --input "${OUT}/bcl" \
+    --csv "${OUT}/bcl/layout.csv" \
+    --output "${OUT}/fastqs"
+fi
+
+# Create count matrices
+if [ ! -d "${OUT}/counts" ]; then
+  mkdir -p "$OUT/counts"
+  
+  target/docker/mapping/cellranger_atac_count/cellranger_atac_count \
+    --input "${OUT}/fastqs/HJN3KBCX2/test_sample/" \
+    --reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \
+    --output "${OUT}/counts"
+fi
--- a/resources_test_scripts/cellranger_tiny_bcl.sh
+++ b/resources_test_scripts/cellranger_tiny_bcl.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_tiny_bcl
+OUT="resources_test/$ID/"
+DIR="$OUT"
+
+# create tempdir
+MY_TEMP="${VIASH_TEMP:-/tmp}"
+TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+# download bcl data
+if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
+  mkdir -p "$OUT/bcl"
+
+  # download tar gz
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz \
+    --output "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
+  
+  # untar
+  tar -xf "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" \
+    --strip-components=1 \
+    -C "$OUT/bcl"
+
+  # remove tar
+  rm "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
+
+  # download sample sheet
+  target/docker/download/download_file/download_file \
+    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv \
+    --output "${OUT}/bcl/sample_sheet.csv"
+fi
+
+if [ ! -d "${OUT}/fastqs" ]; then
+  mkdir -p "$OUT/fastqs"
+
+  target/docker/demux/cellranger_mkfastq/cellranger_mkfastq \
+    --input "${OUT}/bcl" \
+    --sample_sheet "${OUT}/bcl/sample_sheet.csv" \
+    --output "${OUT}/fastqs"
+fi
+
+# bcl-convert requires a v2 sample sheet
+# bcl-convert is a bit more strict concerning filter files being present or not.
+# We make a copy and make the necessary adaptations. 
+
+# We are using the tiny bcl dataset provided by Illumina:
+#   https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
+# Unfortunately,
+#   1. the sample sheet delivered with it does not work with bcl-convert (v1 of the format)
+#   2. 2 filter files are missing from the run directory that bcl-convert requires to run
+#
+# We worked around this by
+#   1. Manually editing a sample sheet file suited for bcl-convert (format v2)
+#   2. Adding a filter file
+#
+# The filter file is a binary file, we just created an empty file use that.
+# bcl-convert might complain about it, but at least something is written out.
+# An alternative is to use a filter file from a different project. This also generates
+# a warning, but the fastq ouput files contain reads. The drawback is that those filter files
+# are generally above 100MB in size.
+#
+# TODO: Check if a (binary) filter file can be generated that is small but works.
+
+if [ ! -f "${OUT}/bcl2/sample_sheet.csv" ]; then
+  mkdir "${OUT}/bcl2/"
+  cp -r ${OUT}/bcl/* "${OUT}/bcl2/"
+  cat > "${OUT}/bcl2/sample_sheet.csv" << HERE
+[Header],,,,,,,,,
+FileFormatVersion,2,,,,,,
+RunName,hiseq_test,,,,,,
+InstrumentPlatform,NextSeq,,,,,,
+IndexOrientation,Forward,,,,,,
+,,,,,,,,,
+[Reads],,,,,,,,,
+Read1Cycles,26,,,,,,,,,
+Read2Cycles,98,,,,,,
+,,,,,,,,,
+[Sequencing_Settings],,,,,,,
+,,,,,,,
+[BCLConvert_Settings],,,,,,,
+SoftwareVersion,3.8.4,,,,,,
+NoLaneSplitting,true,,,,,,
+FastqCompressionFormat,gzip,,,,,,
+,,,,,,,,,
+[BCLConvert_Data],,,,,,,
+Sample_ID,index,,,,,,
+s1,GGTTTACT,,,,,,
+,,,,,,,
+[Cloud_Settings],,,,,,,
+GeneratedVersion,1.3.0.202111171923,,,,,,
+,,,,,,,
+[Cloud_Data],,,,,,,
+Sample_ID,ProjectName,LibraryName,LibraryPrepKitName,IndexAdapterKitName,I7_Index_ID,Sample_Name,Description,Instrument,Type
+s1,p1,s1_SI-P03-C9,,,IDT01,SI-P03-C9,s1,NextSeq,HighOutput_75cycles
+HERE
+  
+  touch "${OUT}/bcl2/Data/Intensities/BaseCalls/L001/s_1_1101.filter"
+fi
--- a/resources_test_scripts/cellranger_tiny_fastq.sh
+++ b/resources_test_scripts/cellranger_tiny_fastq.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# settings
+ID=cellranger_tiny_fastq
+OUT="resources_test/$ID/"
+DIR="$OUT"
+
+# download cellranger tar gz
+cellranger_tar_gz="${OUT}/temp_cellranger-6.1.2.tar.gz"
+if [ ! -f "$cellranger_tar_gz" ]; then
+  echo "Download Cell Ranger 6.1.2 manually first!"
+  exit 1
+fi
+
+# untar fastqs
+cellranger_tiny_fastq="${OUT}/cellranger_tiny_fastq"
+if [ ! -f "${cellranger_tiny_fastq}/tinygex_S1_L001_R1_001.fastq.gz" ]; then
+  mkdir -p "$cellranger_tiny_fastq"
+  
+  tar -xzf "$cellranger_tar_gz" \
+    -C "$cellranger_tiny_fastq" \
+    "cellranger-6.1.2/external/cellranger_tiny_fastq" \
+    --strip-components=3
+fi
+
+# untar ref
+cellranger_tiny_ref="${OUT}/cellranger_tiny_ref"
+if [ ! -f "${cellranger_tiny_ref}/reference.json" ]; then
+  mkdir -p "$cellranger_tiny_ref"
+  
+  tar -xzf "$cellranger_tar_gz" \
+    -C "$cellranger_tiny_ref" \
+    "cellranger-6.1.2/external/cellranger_tiny_ref" \
+    --strip-components=3
+fi
+
+# Create ref with more recent STAR version
+recent_ref_dir="${OUT}/cellranger_tiny_ref_v2_7_10_a"
+if [ ! -f "${recent_ref_dir}/Genome" ]; then
+  mkdir -p "${recent_ref_dir}"
+
+  target/docker/mapping/star_build_reference/star_build_reference \
+    --genome_fasta "$cellranger_tiny_ref/fasta/genome.fa" \
+    --output "$recent_ref_dir" \
+    --genomeSAindexNbases 7 \
+    --transcriptome_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz"
+fi
+
+# run cellranger count
+bam_dir="${OUT}/bam"
+if [ ! -f "$bam_dir/possorted_genome_bam.bam" ]; then
+  mkdir -p "$bam_dir"
+
+  viash run src/mapping/cellranger_count/config.vsh.yaml -- \
+    --input "$cellranger_tiny_fastq" \
+    --reference "$cellranger_tiny_ref" \
+    --output "$bam_dir"
+fi
+
+# convert to h5mu
+raw_h5mu="${OUT}/raw_dataset.h5mu"
+if [ ! -f "$step1_h5mu" ]; then
+  viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+    --input "${bam_dir}/raw_feature_bc_matrix.h5" \
+    --output "$raw_h5mu"
+fi
+
+# run velocyto
+velo_gtf="$cellranger_tiny_ref/genes/genes.gtf.gz"
+velo_bam="$bam_dir/possorted_genome_bam.bam"
+velo_loom="${OUT}/velocyto.loom"
+if [ ! -f "$velo_loom" ]; then
+  viash run src/velocity/velocyto/config.vsh.yaml -- \
+    --input "$velo_bam" \
+    --output "$velo_loom" \
+    --transcriptome "$velo_gtf"
+fi
+
+# combine raw counts with velocyto data
+dataset_h5mu="${OUT}/dataset.h5mu"
+if [ ! -f "$dataset_h5mu" ]; then
+  viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
+    --input_loom "$velo_loom" \
+    --input_h5mu "$raw_h5mu" \
+    --output "$dataset_h5mu"
+fi
+
+# run htseq
+htseq_counts="${OUT}/htseq_counts.tsv"
+if [ ! -f "$htseq_counts" ]; then
+  viash run src/mapping/htseq_count/config.vsh.yaml -- \
+  --input "$velo_bam" \
+  --reference "$velo_gtf" \
+  --output "$htseq_counts"
+fi
+
+multi_star="${OUT}/multi_star"
+if [ ! -d "$multi_star" ]; then
+  viash run src/mapping/multi_star/config.vsh.yaml -- \
+    --input_id "tinygex" \
+    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz" \
+    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz" \
+    --input_id "tinygex" \
+    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz" \
+    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz" \
+    --reference_index "$recent_ref_dir" \
+    --reference_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz" \
+    --output "$multi_star" \
+    ---cpus 30
+fi
--- a/resources_test_scripts/concat_test_data.sh
+++ b/resources_test_scripts/concat_test_data.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# The output folder
+OUT="resources_test/concat_test_data/"
+
+# create it if it doesn't exist already
+[ -d "$OUT" ] || mkdir -p "$OUT"
+
+echo "> Downloading files"
+target/docker/download/download_file/download_file \
+  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/e18_mouse_brain_fresh_5k/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5 \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5"
+
+target/docker/download/download_file/download_file \
+  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/human_brain_3k/human_brain_3k_filtered_feature_bc_matrix.h5 \
+  --output "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5"
+
+echo "> Converting to h5mu"
+viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+  --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu"
+
+viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5" \
+  --output "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu"
+
+echo "> Subsetting datasets"
+viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
+  --output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+  --number_of_observations 2000
+
+viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+  --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
+  --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+  --number_of_observations 2000
+
+echo "Making observation ids unique (required for concat component to function)"
+viash run src/metadata/add_id/config.vsh.yaml -- \
+--input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+--output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+--input_id "human" \
+--make_observation_keys_unique
+
+viash run src/metadata/add_id/config.vsh.yaml -- \
+--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+--input_id "mouse" \  
+--make_observation_keys_unique
+
+echo "Removing temp files"
+rm "${OUT}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
+   "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
+   "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
+   "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
+   "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
+   "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5"
+   
+
+echo "> Running concat component"
+viash run src/dataflow/concat/config.vsh.yaml -- \
+  --input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu,$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
+  --input_id "human,mouse" \
+  --output "$OUT/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"
--- a/resources_test_scripts/demuxafy_test_data.sh
+++ b/resources_test_scripts/demuxafy_test_data.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -eo pipefail
+
+
+# settings
+ID=demuxafy_test_data
+OUT=resources_test/$ID
+DIR="$OUT"
+
+mkdir -p "$OUT"
+cd "$OUT"
+# download demuxafy test dataset
+wget https://www.dropbox.com/s/m8u61jn4i1mcktp/TestData4PipelineSmall.tar.gz
+tar -xf TestData4PipelineSmall.tar.gz
+# bam and vcf file
+cp TestData4PipelineSmall/test_dataset.vcf .
+# extract chr from vcf file
+grep -w '^#\|^#CHROM\|^[1-2]' test_dataset.vcf > test_dataset_chr1_2.vcf
+grep -w '^#\|^#CHROM\|^[3-4]' test_dataset.vcf > test_dataset_chr3_4.vcf
+
+# barcode list
+cp TestData4PipelineSmall/test_dataset/outs/filtered_gene_bc_matrices/Homo_sapiens_GRCh38p10/barcodes.tsv .
+
+# subsetted bam and bai for souporcell
+wget https://www.dropbox.com/s/7ew5lt0msf4z5gj/chr_1_pooled.sorted.bam
+wget https://www.dropbox.com/s/tpplbj9sab9b2p4/chr_1_pooled.sorted.bam.bai
+
+# variants from mixed sample
+wget https://www.dropbox.com/s/btir7ge4kzc7tu1/mixed_variant.vcf
+
+# dsc_pileup output
+wget https://www.dropbox.com/s/17hj9i0yavtezx1/dsc_pileup.zip
+unzip dsc_pileup.zip
+
+# subsetted human genome reference
+wget https://www.dropbox.com/s/ynlce3g7nwxthwg/genome_chr1.fa
+
+# remove unnecessary files
+rm -rf TestData4PipelineSmall
+rm TestData4PipelineSmall.tar.gz
+rm dsc_pileup.zip
--- a/resources_test_scripts/hlca_reference_model.sh
+++ b/resources_test_scripts/hlca_reference_model.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=HLCA_reference_model
+OUT=resources_test/$ID/$ID
+DIR=$(dirname "$OUT")
+
+# ideally, this would be a versioned pipeline run
+[ -d "$DIR" ] || mkdir -p "$DIR"
+
+# download and unarchive pre-trained scANVI model
+wget https://zenodo.org/record/6337966/files/HLCA_reference_model.zip \
+  -O "${OUT}.zip"
+
+# # Test query data
+# # Source publication: Delorey, Toni M., et al. “COVID-19 tissue atlases reveal SARS-CoV-2 pathology and cellular targets.” Nature 595.7865 (2021): 107-113.
+# wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5230nnn/GSM5230027/suppl/GSM5230027_04-P103142-S149-R01_raw_feature_bc_matrix.h5.gz \
+#   -O "${OUT}_query_test.h5.gz"
+# gzip -d "${OUT}_query_test.h5.gz"
+
+# # Prepare test data as in scvi-tools tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/query_hlca_knn.html
+# python <<HEREDOC
+# import pandas as pd
+# import scanpy as sc
+
+# geo_metadata_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171668/suppl/GSE171668_lung_metadata.csv.gz"
+# metadata = pd.read_csv(geo_metadata_url, index_col=0)
+
+# DATA_PATH = "${OUT}_query_test.h5"
+# query_data = sc.read_10x_h5(DATA_PATH)
+# # clean up .var.index (gene names)
+# query_data.var['gene_names'] = query_data.var.index
+# query_data.var.index = [idx.split("___")[-1] for idx in query_data.var.gene_ids]
+# # clean up cell barcodes:
+# query_data.obs.index = query_data.obs.index.str.rstrip("-1")
+# # read in metadata (to select only cells of interest and remove empty drops)
+# # subset to cells from our sample
+# metadata = metadata.loc[metadata.donor == "D12_4",:].copy()
+# # clean up barcodes:
+# metadata.index = [idx.split("-")[-1] for idx in metadata.index]
+# # subset adata to cells in metadata:
+# query_data = query_data[metadata.index,:].copy()
+# # add dataset information:
+# query_data.obs['dataset'] = "test_dataset_delorey_regev"
+# sc.write(DATA_PATH, query_data)
+# HEREDOC
+
+# # convert 10x h5 to h5mu
+# viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml -- \
+#   --input "${OUT}_query_test.h5" \
+#   --output "${OUT}_query_test.h5mu"
--- a/resources_test_scripts/merge_test_data.sh
+++ b/resources_test_scripts/merge_test_data.sh
@@ -0,0 +1,16 @@
+
+#!/bin/bash
+
+
+
+# settings
+ID=merge_test_data
+OUT=resources_test/$ID
+DIR="$OUT"
+
+mkdir -p "$OUT"
+
+target/docker/dataflow/split_modalities/split_modalities \
+  --input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu \
+  --output "$OUT"
+
--- a/resources_test_scripts/pbmc_1k_protein_v3.sh
+++ b/resources_test_scripts/pbmc_1k_protein_v3.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+ID=pbmc_1k_protein_v3
+OUT=resources_test/$ID/$ID
+DIR=$(dirname "$OUT")
+
+# ideally, this would be a versioned pipeline run
+[ -d "$DIR" ] || mkdir -p "$DIR"
+
+# dataset page:
+# https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-gene-expression-and-cell-surface-protein-3-standard-3-0-0
+
+# download metrics summary
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv \
+  -O "${OUT}_metrics_summary.csv"
+
+# download counts h5 file
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 \
+  -O "${OUT}_filtered_feature_bc_matrix.h5"
+
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \
+  -O "${OUT}_raw_feature_bc_matrix.h5"
+
+# download counts matrix tar gz file
+wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.tar.gz \
+  -O "${OUT}_filtered_feature_bc_matrix.tar.gz"
+
+# extract matrix tar gz
+mkdir -p "${OUT}_filtered_feature_bc_matrix"
+tar -xvf "${OUT}_filtered_feature_bc_matrix.tar.gz" \
+  -C "${OUT}_filtered_feature_bc_matrix" \
+  --strip-components 1
+rm "${OUT}_filtered_feature_bc_matrix.tar.gz"
+
+# convert 10x h5 to h5mu
+target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu \
+  --input "${OUT}_filtered_feature_bc_matrix.h5" \
+  --input_metrics_summary "${OUT}_metrics_summary.csv" \
+  --output "${OUT}_filtered_feature_bc_matrix.h5mu"
+
+# run single sample
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/rna/rna_singlesample/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_uss \
+  --input "${OUT}_filtered_feature_bc_matrix.h5mu" \
+  --output "`basename $OUT`_uss.h5mu" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+# add the sample ID to the mudata object
+nextflow \
+  run . \
+  -main-script target/nextflow/metadata/add_id/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_uss \
+  --input "${OUT}_uss.h5mu" \
+  --input_id "pbmc_1k_protein_v3_uss" \
+  --output "`basename $OUT`_uss_with_id.h5mu" \
+  --output_compression "gzip" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+# run multisample
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/rna/rna_multisample/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_ums \
+  --input "${OUT}_uss_with_id.h5mu" \
+  --output "`basename $OUT`_ums.h5mu" \
+  --publishDir `dirname $OUT` \
+  -resume
+
+rm "${OUT}_uss_with_id.h5mu"
+
+# run dimred
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_mms \
+  --input "${OUT}_ums.h5mu" \
+  --output "`basename $OUT`_mms.h5mu" \
+  --publishDir `dirname $OUT` \
+  --obs_covariates sample_id \
+  -resume
+
+# run integration
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/integration/harmony_leiden/main.nf \
+  -c src/workflows/utils/labels_ci.config \
+  -profile docker \
+  --id pbmc_1k_protein_v3_mms_integration \
+  --input "${OUT}_mms.h5mu" \
+  --output "`basename $OUT`_mms.h5mu" \
+  --publishDir `dirname $OUT` \
+  --obs_covariates sample_id \
+  -resume
+
+python <<HEREDOC
+import mudata as mu
+mudata = mu.read_h5mu("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu")
+mudata.mod["rna"].write_h5ad("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad")
+HEREDOC
+
+# Convert to Seurat
+viash run src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml -- \
+  --input "${OUT}_mms.h5mu" \
+  --output "${OUT}_mms.rds" 
--- a/resources_test_scripts/ref_gencodev41_chr1.sh
+++ b/resources_test_scripts/ref_gencodev41_chr1.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=reference_gencodev41_chr1
+OUT=resources_test/$ID
+
+mkdir -p "$OUT" 
+
+wget "https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" -O "$OUT/ERCC92.zip"
+
+# Download JASPAR files for reference building
+# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
+motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+motifs_in="${OUT}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
+
+if [ ! -f "$motifs_in" ]; then
+    curl -sS "$motifs_url" > "$motifs_in"
+fi
+
+# Change motif headers so the human-readable motif name precedes the motif
+# identifier. So ">MA0004.1    Arnt" -> ">Arnt_MA0004.1".
+motifs_modified="${OUT}/$(basename "$motifs_in").modified"
+awk '{
+    if ( substr($1, 1, 1) == ">" ) {
+        print ">" $2 "_" substr($1,2)
+    } else {
+        print
+    }
+}' "$motifs_in" > "$motifs_modified"
+
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: "$ID"
+    genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
+    transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz"
+    target: ["bd_rhapsody", "cellranger_arc"] 
+    output_fasta: "reference.fa.gz"
+    output_gtf: "reference.gtf.gz"
+    non_nuclear_contigs: null
+    output_cellranger_arc: "reference_cellranger.tar.gz"
+    output_bd_rhapsody: "reference_bd_rhapsody.tar.gz"
+    bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2"
+    motifs_file: "$motifs_modified"
+    subset_regex: "chr1"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
+  -profile docker \
+  -c ./src/workflows/utils/labels_ci.config \
+  -params-file /tmp/params.yaml \
+  --publish_dir $OUT \
+  -resume
--- a/resources_test_scripts/remote_param_list.sh
+++ b/resources_test_scripts/remote_param_list.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+mkdir -p "resources_test/remote_param_list/"
+OUT=resources_test/remote_param_list/test_param_list.yaml
+OUT_CSV=resources_test/remote_param_list/test_param_list.csv
+OUT_JSON=resources_test/remote_param_list/test_param_list.json
+
+cat > $OUT << HERE
+- id: "mouse"
+  input: s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
+  publish_dir: "foo_remote/"
+  rna_min_counts: 2
+  prot_min_counts: 3
+- id: "human"
+  input: s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
+  publish_dir: "foo_remote/"
+  rna_min_counts: 2
+  prot_min_counts: 3
+HERE
+
+cat > $OUT_CSV << EOF
+"id","input","publish_dir","rna_min_counts","prot_min_counts"
+"mouse","s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
+"human","s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
+EOF
+
+cat > $OUT_JSON << HERE
+[
+    {
+        "id": "mouse",
+        "input": "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+        "publish_dir": "foo_remote/",
+        "rna_min_counts": 2,
+        "prot_min_counts": 3
+    },
+    {
+        "id": "human",
+        "input": "s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+        "publish_dir": "foo_remote/",
+        "rna_min_counts": 2,
+        "prot_min_counts": 3
+    }
+]
+HERE
--- a/resources_test_scripts/rna_velocity.sh
+++ b/resources_test_scripts/rna_velocity.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=rna_velocity
+OUT=resources_test/$ID
+
+
+# create raw directory
+velocyto_dir="$OUT/velocyto"
+mkdir -p "$velocyto_dir"
+
+########################################################
+# Create a compatible BAM file from BD Rhapsody Output #
+########################################################
+
+bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/output_raw/Combined_sample_Bioproduct.bam"
+
+if [[ ! -f "$bd_rhap_wta_bam" ]]; then
+    echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
+    exit 1
+fi
+
+echo "> Converting BD Rhapsody barcode tags."
+viash run src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml -- \
+  -i "$bd_rhap_wta_bam" \
+  -o "$velocyto_dir/compatible_bd_input.bam" \
+  --bam \
+  -t 4
+
+echo "> Creating barcodes file."
+samtools view -@4 "$velocyto_dir/compatible_bd_input.bam" | \
+  grep -oP "(?<=CB:Z:)\S+" | sort | uniq | head > "$velocyto_dir/barcodes.txt"
+
+###########################################################
+# Process Tiny Fast Fastq dataset from 10X to create      #
+# input data for convert/from_velocyto_to_h5mu compontent #
+###########################################################
+
+mkdir "$OUT/velocyto_processed"
+
+gtf="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz"
+bam="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"
+
+echo "> Processing 10x dataset"
+viash run src/velocity/velocyto/config.vsh.yaml -- \
+  -i "$bam" \
+  -o "$OUT/velocyto_processed/cellranger_tiny.loom" \
+  --transcriptome "$gtf"
+
+echo "> Converting loom file to MuData object"
+viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
+  --input_loom "$OUT/velocyto_processed/cellranger_tiny.loom" \
+  --input_h5mu "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu" \
+  --modality velocyto \
+  --output_compression "gzip" \
+  --output "$OUT/velocyto_processed/velocyto.h5mu"
--- a/resources_test_scripts/tiledb.sh
+++ b/resources_test_scripts/tiledb.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# ensure that the command below is run from the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+cd "$REPO_ROOT"
+
+# settings
+ID=tiledb
+OUT="resources_test/$ID"
+
+# create raw directory
+raw_dir="$OUT/"
+mkdir -p "$raw_dir"
+
+
+viash run src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml -- \
+    --input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu \
+    --tiledb_dir "$OUT"/pbmc_1k_protein_v3_mms \
+    --rna_modality "rna" \
+    --rna_raw_layer_input "X" \
+    --rna_normalized_layer_input "log_normalized" \
+    --rna_var_gene_names_input "gene_symbol"
--- a/resources_test_scripts/vireo_test_data.sh
+++ b/resources_test_scripts/vireo_test_data.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -eo pipefail
+
+
+# settings
+ID=vireo_test_data
+OUT=resources_test/$ID
+DIR="$OUT"
+
+mkdir -p "$OUT"
+cd "$OUT"
+# download vireo tutorial dataset
+wget https://github.com/single-cell-genetics/vireo/raw/master/data/cells.cellSNP.vcf.gz
--- a/ruff.toml
+++ b/ruff.toml
@@ -0,0 +1,43 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".git",
+    ".pyenv",
+    ".pytest_cache",
+    ".ruff_cache",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+]
+
+builtins = ["meta"]
+
+
+
+
+[format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+[lint.flake8-pytest-style]
+fixture-parentheses = false
+mark-parentheses = false
+
+[lint]
+ignore = [
+  # module level import not at top of file
+  "E402"
+]
--- a/src/annotate/celltypist/config.vsh.yaml
+++ b/src/annotate/celltypist/config.vsh.yaml
@@ -0,0 +1,165 @@
+name: celltypist
+namespace: annotate
+scope: "public"
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        default: log_normalized
+        description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used. 
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+        default: "cell_ontology_class"
+      - name: "--reference_var_input"
+        type: string
+        default: "filter_with_hvg"
+        required: false
+        description: |
+          .var column containing highly variable genes. If not provided, genes will not be subset.
+      - name: "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
+
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: false
+      - name: "--majority_voting"
+        type: boolean
+        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
+        default: false
+      - name: "--C"
+        type: double
+        description: "Inverse of regularization strength in logistic regression."
+        default: 1.0
+      - name: "--max_iter"
+        type: integer
+        description: "Maximum number of iterations before reaching the minimum of the cost function."
+        default: 1000
+      - name: "--use_SGD"
+        type: boolean_true
+        description: "Whether to use the stochastic gradient descent algorithm."
+      - name: "--min_prop"
+        type: double
+        description: |
+          "For the dominant cell type within a subcluster, the minimum proportion of cells required to 
+          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False. 
+          Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'."
+        default: 0
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: celltypist_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: celltypist_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/cross_check_genes.py
+  - path: /src/utils/subset_vars.py
+  - path: /src/utils/set_var_index.py
+  - path: /src/utils/is_lognormalized.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: nvcr.io/nvidia/pytorch:25.11-py3
+    setup:
+      - type: python
+        packages:
+          - celltypist==1.7.1
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highcpu, highmem, highdisk, gpu]
--- a/src/annotate/celltypist/script.py
+++ b/src/annotate/celltypist/script.py
@@ -0,0 +1,166 @@
+import sys
+import celltypist
+import mudata as mu
+import anndata as ad
+import pandas as pd
+from torch.cuda import is_available as cuda_is_available
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    # "reference": None,
+    "reference_var_input": None,
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,
+    # "model": "resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl",
+    "input_layer": None,
+    "reference_layer": "log_normalized",
+    "input_reference_gene_overlap": 100,
+    "reference_obs_target": "cell_ontology_class",
+    "feature_selection": True,
+    "majority_voting": True,
+    "output_compression": "gzip",
+    "input_var_gene_names": "dup_idx",
+    "reference_var_gene_names": "ensemblid",
+    "C": 1.0,
+    "max_iter": 1000,
+    "use_SGD": False,
+    "min_prop": 0,
+    "output_obs_predictions": "celltypist_pred",
+    "output_obs_probability": "celltypist_probability",
+    "sanitize_ensembl_ids": False,
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from cross_check_genes import cross_check_genes
+from set_var_index import set_var_index
+from subset_vars import subset_vars
+from is_lognormalized import is_lognormalized
+
+logger = setup_logger()
+use_gpu = cuda_is_available()
+logger.info("GPU enabled? %s", use_gpu)
+
+
+def main(par):
+    if (not par["model"] and not par["reference"]) or (
+        par["model"] and par["reference"]
+    ):
+        raise ValueError(
+            "Make sure to provide either 'model' or 'reference', but not both."
+        )
+
+    input_mudata = mu.read_h5mu(par["input"])
+    input_adata = input_mudata.mod[par["modality"]]
+    input_modality = input_adata.copy()
+
+    # Provide correct format of query data for celltypist annotation
+    ## Sanitize gene names and set as index
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
+
+    ## Fetch lognormalized counts
+    lognorm_counts = (
+        input_modality.layers[par["input_layer"]].copy()
+        if par["input_layer"]
+        else input_modality.X.copy()
+    )
+
+    if not is_lognormalized(lognorm_counts, target_sum=10000):
+        raise ValueError(
+            "Invalid expression matrix, expect input layer to contain log1p normalized expression to 10000 counts per cell."
+        )
+
+    ## Create AnnData object
+    input_modality = ad.AnnData(
+        X=lognorm_counts, var=pd.DataFrame(index=input_modality.var.index)
+    )
+
+    if par["model"]:
+        logger.info("Loading CellTypist model")
+        model = celltypist.models.Model.load(par["model"])
+        cross_check_genes(
+            input_modality.var.index,
+            model.features,
+            min_gene_overlap=par["input_reference_gene_overlap"],
+        )
+
+    elif par["reference"]:
+        reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]
+
+        # Check expression before subsetting to HVG
+        if not is_lognormalized(lognorm_counts, target_sum=10000):
+            raise ValueError(
+                "Invalid expression matrix, expect reference layer to contain log1p normalized expression to 10000 counts per cell."
+            )
+
+        # subset to HVG if required
+        if par["reference_var_input"]:
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
+
+        # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
+        # CellTypist requires query gene names to be in index
+        reference_modality = set_var_index(
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
+        )
+
+        # Ensure enough overlap between genes in query and reference
+        cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            min_gene_overlap=par["input_reference_gene_overlap"],
+        )
+
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
+
+        labels = reference_modality.obs[par["reference_obs_target"]]
+
+        logger.info("Training CellTypist model on reference")
+        model = celltypist.train(
+            reference_matrix,
+            labels=labels,
+            genes=reference_modality.var.index.astype(
+                str
+            ),  # ensure string type, fails if categorical when making var indices unique
+            C=par["C"],
+            max_iter=par["max_iter"],
+            use_SGD=par["use_SGD"],
+            feature_selection=par["feature_selection"],
+            check_expression=False,  # if True, throws an error if lognormalized data are subset for HVG,
+            use_GPU=use_gpu,
+        )
+
+    logger.info("Predicting CellTypist annotations")
+    # Make sure .var index is string dtype, fails if categorical when making var indices unique
+    input_modality.var.index = input_modality.var.index.astype(str)
+    predictions = celltypist.annotate(
+        input_modality, model, majority_voting=par["majority_voting"], use_GPU=use_gpu
+    )
+
+    input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[
+        "predicted_labels"
+    ].values
+    input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(
+        axis=1
+    ).values
+
+    # copy observations back to input data (with full set of features)
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main(par)
--- a/src/annotate/celltypist/test.py
+++ b/src/annotate/celltypist/test.py
@@ -0,0 +1,327 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+import scanpy as sc
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+
+## VIASH START
+meta = {
+    "executable": "./target/docker/annotate/celltypist/celltypist",
+    "resources_dir": "resources_test/",
+    "cpus": 15,
+    "memory_gb": 20,
+    "config": "src/annotate/celltypist/config.vsh.yaml",
+}
+## VIASH END
+
+
+model_file = (
+    f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
+)
+input_file_1 = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
+input_file_2 = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+
+
+def log_normalize(adata):
+    adata_norm = sc.pp.normalize_total(adata, target_sum=1e4, copy=True)
+    adata_lognorm = sc.pp.log1p(adata_norm, copy=True)
+    adata.layers["log_normalized"] = adata_lognorm.X
+    return adata
+
+
+def calculate_hvg(adata, n_top_genes=1000):
+    adata_hvg = sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, copy=True)
+    adata.var["filter_with_hvg"] = adata_hvg.var["highly_variable"]
+    return adata
+
+
+@pytest.fixture
+def reference_mdata():
+    mdata = mu.read_h5mu(reference_file)
+    adata = mdata.mod["rna"]  # already has layer "log_normalized" with 10k target sum
+    adata.var["filter_with_hvg"] = adata.var[
+        "highly_variable"
+    ]  # already has highly variable genes calculated
+    return mdata
+
+
+@pytest.fixture
+def input_mdata():
+    mdata = mu.read_h5mu(input_file_1)
+    adata = mdata.mod["rna"].copy()
+    adata.layers["counts"] = adata.X.copy()  # store raw counts in a layer
+    adata_lognorm = log_normalize(adata)
+    mdata.mod["rna"] = adata_lognorm
+    return mdata
+
+
+@pytest.fixture
+def model_input_mdata():
+    mdata = mu.read_h5mu(input_file_2)
+    adata = mdata.mod["rna"].copy()
+    adata_lognorm = log_normalize(adata)
+    mdata.mod["rna"] = adata_lognorm
+    return mdata
+
+
+def test_simple_execution(
+    run_component, random_h5mu_path, reference_mdata, input_mdata, write_mudata_to_file
+):
+    output_file = random_h5mu_path()
+    input_file = write_mudata_to_file(input_mdata)
+    reference_file = write_mudata_to_file(reference_mdata)
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
+
+    predictions = output_mudata.mod["rna"].obs["celltypist_pred"]
+    assert predictions.dtype == "category", (
+        "Calculated predictions should be category dtype"
+    )
+    assert not all(predictions.isna()), "Not all predictions should be NA"
+
+    probabilities = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert probabilities.dtype == "float", (
+        "Calculated probabilities should be float dtype"
+    )
+    assert all(0 <= value <= 1 for value in probabilities), (
+        ".obs at celltypist_probability has values outside the range [0, 1]"
+    )
+
+
+def test_set_params(
+    run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
+):
+    input_file = write_mudata_to_file(input_mdata)
+    reference_file = write_mudata_to_file(reference_mdata)
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--feature_selection",
+            "True",
+            "--majority_voting",
+            "True",
+            "--C",
+            "0.5",
+            "--max_iter",
+            "100",
+            "--use_SGD",
+            "--min_prop",
+            "0.1",
+            "--output",
+            output_file,
+            "--output_compression",
+            "gzip",
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
+
+    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        ".obs at celltypist_probability has values outside the range [0, 1]"
+    )
+
+
+def test_with_model(
+    run_component, random_h5mu_path, write_mudata_to_file, model_input_mdata
+):
+    output_file = random_h5mu_path()
+    input_file = write_mudata_to_file(model_input_mdata)
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--model",
+            model_file,
+            "--reference_layer",
+            "",
+            "--reference_obs_targets",
+            "cell_type",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
+
+    predictions = output_mudata.mod["rna"].obs["celltypist_pred"]
+    assert predictions.dtype == "category", (
+        "Calculated predictions should be category dtype"
+    )
+    assert not all(predictions.isna()), "Not all predictions should be NA"
+
+    probabilities = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert probabilities.dtype == "float", (
+        "Calculated probabilities should be float dtype"
+    )
+    assert all(0 <= value <= 1 for value in probabilities), (
+        ".obs at celltypist_probability has values outside the range [0, 1]"
+    )
+
+
+def test_fail_invalid_input_expression(
+    run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
+):
+    output_file = random_h5mu_path()
+    input_file = write_mudata_to_file(input_mdata)
+    reference_file = write_mudata_to_file(reference_mdata)
+
+    # fails because input data are not correctly lognormalized
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--input_layer",
+                "counts",
+                "--reference",
+                reference_file,
+                "--reference_layer",
+                "log_normalized",
+                "--reference_var_gene_names",
+                "ensemblid",
+                "--output",
+                output_file,
+            ]
+        )
+    assert re.search(
+        r"Invalid expression matrix, expect input layer to contain log1p normalized expression to 10000 counts per cell",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_with_hvg(
+    run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
+):
+    output_file = random_h5mu_path()
+    input_file = write_mudata_to_file(input_mdata)
+    reference_file = write_mudata_to_file(reference_mdata)
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--reference_var_input",
+            "highly_variable",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
+
+    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        ".obs at celltypist_probability has values outside the range [0, 1]"
+    )
+
+
+def test_duplicate_categorical_index_entry(
+    run_component, random_h5mu_path, reference_mdata, input_mdata, write_mudata_to_file
+):
+    """Test that the component handles duplicate index entries correctly."""
+    output_file = random_h5mu_path()
+
+    # Create a modified input with duplicate index entries
+    indices = input_mdata.mod["rna"].var["gene_symbol"].to_list()
+    indices[0] = indices[1]  # duplicate the first index entry
+    import pandas as pd
+
+    input_mdata.mod["rna"].var["dup_idx"] = pd.CategoricalIndex(indices)
+
+    input_file = write_mudata_to_file(input_mdata)
+    reference_file = write_mudata_to_file(reference_mdata)
+
+    # Component should not raise with duplicate categorical indices
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "feature_name",
+            "--input_var_gene_names",
+            "dup_idx",
+            "--sanitize_ensembl_ids",
+            "False",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/consensus_vote/config.vsh.yaml
+++ b/src/annotate/consensus_vote/config.vsh.yaml
@@ -0,0 +1,111 @@
+name: consensus_vote
+namespace: annotate
+scope: "public"
+description: |
+  Combines cell type predictions from multiple annotation methods into a single consensus prediction using a weighted majority vote. 
+  For each cell, each method votes for its predicted cell type, optionally weighted by the probability score and/or a per-method weight.
+  The consensus prediction is the cell type with the highest total weighted vote.
+  Note that this method does not leverage pre-existing ontology or perform any reconciliation of cell type labels across methods, so the same cell type may be represented by different labels in different methods and will be treated as distinct cell types in the vote.
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset arguments.
+    arguments:
+      - name: "--input"
+        type: file
+        description: Input h5mu file containing cell type predictions in .obs.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_obs_predictions"
+        type: string
+        description: |
+          One or more .obs column names containing cell type predictions (labels) from
+          different annotation methods.
+        required: true
+        multiple: true
+        example: ["scanvi_pred", "celltypist_pred"]
+      - name: "--input_obs_probabilities"
+        type: string
+        description: |
+          One or more .obs column names containing prediction probability scores,
+          one per method in --input_obs_predictions. When provided, each method's
+          vote is scaled by the probability score for that cell (in addition to
+          any per-method --weights). Must be the same length as --input_obs_predictions.
+        required: false
+        multiple: true
+        example: ["scanvi_prob", "celltypist_prob", "singler_prob"]
+      - name: "--tie_label"
+        type: string
+        description: |
+          Label to assign when two or more cell types receive equal votes.
+          If not provided, tied cells are assigned None (missing value).
+        required: false
+        example: "Unknown"
+      - name: "--weights"
+        type: double
+        description: |
+          Per-method weights for the consensus vote. Must be the same length as
+          --input_obs_predictions when provided. Weights are normalized to sum to 1
+          before use. If not provided, all methods are weighted equally.
+        required: false
+        multiple: true
+        example: [1.0, 2.0]
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        alternatives: [-o]
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: consensus_pred
+        required: false
+        description: |
+          In which `.obs` slot to store the consensus predicted cell type.
+      - name: "--output_obs_score"
+        type: string
+        default: consensus_score
+        required: false
+        description: |
+          In which `.obs` slot to store the consensus score, defined as the fraction
+          of total weight assigned to the winning cell type.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/compress_h5mu.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+
+engines:
+  - type: docker
+    image: python:3.13-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, lowmem, lowdisk]
--- a/src/annotate/consensus_vote/script.py
+++ b/src/annotate/consensus_vote/script.py
@@ -0,0 +1,123 @@
+import sys
+import mudata as mu
+import numpy as np
+import pandas as pd
+
+## VIASH START
+par = {
+    "input": "test_with_probabilities.h5mu",
+    "modality": "rna",
+    "input_obs_predictions": ["scanvi_pred", "celltypist_pred", "singler_pred"],
+    "input_obs_probabilities": ["scanvi_prob", "celltypist_prob", "singler_prob"],
+    "weights": None,
+    "tie_label": None,
+    "output": "consensus_test_output.h5mu",
+    "output_obs_predictions": "consensus_pred",
+    "output_obs_score": "consensus_score",
+    "output_compression": "gzip",
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from compress_h5mu import write_h5ad_to_h5mu_with_compression
+
+logger = setup_logger()
+
+
+def main():
+    prediction_cols = par["input_obs_predictions"]
+    prob_cols = par["input_obs_probabilities"]
+    weights = par["weights"]
+
+    if weights and len(weights) != len(prediction_cols):
+        raise ValueError(
+            f"--weights must have the same length as --input_obs_predictions. "
+            f"Got {len(weights)} weights for {len(prediction_cols)} prediction columns."
+        )
+    if prob_cols and len(prob_cols) != len(prediction_cols):
+        raise ValueError(
+            f"--input_obs_probabilities must have the same length as --input_obs_predictions. "
+            f"Got {len(prob_cols)} probability columns for {len(prediction_cols)} prediction columns."
+        )
+
+    logger.info("Reading input data.")
+    adata = mu.read_h5ad(par["input"], mod=par["modality"])
+
+    cols_to_check = [prediction_cols]
+    if prob_cols:
+        cols_to_check.append(prob_cols)
+    for cols in cols_to_check:
+        for col in cols:
+            if col not in adata.obs.columns:
+                raise ValueError(f"Column '{col}' not found in .obs.")
+
+    # Each method is treated equally by default, unless user specific weights are provided
+    n_methods = len(prediction_cols)
+    logger.info("Initializing weights to matrix of ones")
+    weights_arr = np.ones(n_methods, dtype=np.float32)
+    if weights:
+        logger.info("Applying user-provided weights.")
+        weights_arr = np.array(weights, dtype=np.float32)
+    logger.info("Normalizing weights")
+    weights_arr = weights_arr / weights_arr.sum()
+
+    # Apply the weights to the probabilities in the data
+    weights = pd.DataFrame(
+        [weights_arr] * adata.n_obs, index=adata.obs.index, columns=prediction_cols
+    )
+    if prob_cols:
+        logger.info("Scaling the weights with the probabilities from each method")
+        weights = weights * adata.obs[prob_cols].astype(np.float32).to_numpy()
+        assert pd.notna(weights).all(axis=None)
+
+    logger.info("Computing weighted majority vote.")
+    pred_df = adata.obs[prediction_cols].astype(str)
+
+    # For each cell and each method (index), get the label and the weight
+    incidences_weights = pd.DataFrame(
+        {"label": pred_df.stack(), "weights": weights.stack()}
+    )
+    # Move the label to the index, there might be duplicate indices now
+    incidences_weights = incidences_weights.set_index("label", append=True).rename_axis(
+        ["cell_id", "method", "label"]
+    )
+    # Sum the weights per label, from this the labels with the largest weights need to be selected
+    summed_weights = incidences_weights.groupby(level=["cell_id", "label"]).sum()
+    # Find the weight that is the largest per group
+    max_weight_per_group = summed_weights.groupby(level="cell_id").transform("max")
+    # Use the value to look-up the corresponding IDs and labels
+    max_weights_mask = summed_weights["weights"] == max_weight_per_group["weights"]
+    entries_for_max_weights = summed_weights[max_weights_mask].reset_index(
+        level="label"
+    )
+    # Find the cases where there is a tie
+    is_duplicated = max_weights_mask.groupby(level="cell_id").sum() > 1
+    # For the ties, overwrite the label. If a cell is in the frame more than once it is because of a tie.
+    entries_for_max_weights.loc[is_duplicated, ["label"]] = par["tie_label"]
+    # Now its safe to just take the first index in case of duplicates, since the label and the score is the same.
+    entries_for_max_weights = entries_for_max_weights[
+        ~entries_for_max_weights.index.duplicated()
+    ]
+    # Normalize the weights
+    normalized_scores = (
+        entries_for_max_weights["weights"]
+        / incidences_weights["weights"].groupby(level="cell_id").sum()
+    )
+    # Handle devision by 0
+    normalized_scores = normalized_scores.replace([np.inf, -np.inf], 0.0).fillna(0.0)
+    logger.info("Moving the output to the anndata.")
+    adata.obs[par["output_obs_predictions"]] = entries_for_max_weights["label"].astype(
+        "category"
+    )
+    adata.obs[par["output_obs_score"]] = normalized_scores
+
+    logger.info("Writing output data...")
+    write_h5ad_to_h5mu_with_compression(
+        par["output"], par["input"], par["modality"], adata, par["output_compression"]
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/consensus_vote/test.py
+++ b/src/annotate/consensus_vote/test.py
@@ -0,0 +1,303 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+import anndata as ad
+import numpy as np
+import pandas as pd
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+
+## VIASH START
+meta = {
+    "resources_dir": "resources_test",
+    "executable": "./target/docker/annotate/consensus_vote",
+    "config": "./src/annotate/consensus_vote/config.vsh.yaml",
+}
+## VIASH END
+
+
+@pytest.fixture
+def input_mdata(write_mudata_to_file):
+    np.random.seed(42)
+    n_obs = 100
+    cell_types = ["T cell", "B cell", "NK cell", "Monocyte"]
+    obs = pd.DataFrame(index=[f"cell_{i}" for i in range(n_obs)])
+
+    for method in ["scanvi", "celltypist", "singler"]:
+        preds = np.random.choice(cell_types, size=n_obs)
+        obs[f"{method}_pred"] = pd.Categorical(preds, categories=cell_types)
+        obs[f"{method}_prob"] = np.random.uniform(0.5, 1.0, size=n_obs)
+
+    rna = ad.AnnData(
+        X=np.random.rand(n_obs, 10),
+        obs=obs,
+    )
+    prot = ad.AnnData(X=np.random.rand(n_obs, 5), obs=pd.DataFrame(index=obs.index))
+    mdata = mu.MuData({"rna": rna, "prot": prot})
+    return write_mudata_to_file(mdata)
+
+
+@pytest.fixture
+def tied_mdata(write_mudata_to_file):
+    """Fixture where all cells have a guaranteed tie: two methods always disagree."""
+    n_obs = 10
+    obs = pd.DataFrame(index=[f"cell_{i}" for i in range(n_obs)])
+    obs["method_a_pred"] = pd.Categorical(["T cell"] * n_obs)
+    obs["method_b_pred"] = pd.Categorical(["B cell"] * n_obs)
+    rna = ad.AnnData(X=np.random.rand(n_obs, 5), obs=obs)
+    mdata = mu.MuData({"rna": rna})
+    return write_mudata_to_file(mdata)
+
+
+def test_simple_execution(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_mdata,
+            "--input_obs_predictions",
+            "scanvi_pred",
+            "--input_obs_predictions",
+            "celltypist_pred",
+            "--input_obs_predictions",
+            "singler_pred",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist."
+
+    input_mudata = mu.read_h5mu(input_mdata)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert {"consensus_pred", "consensus_score"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs."
+
+    predictions = output_mudata.mod["rna"].obs["consensus_pred"]
+    assert predictions.dtype == "category", (
+        "Consensus predictions should be category dtype."
+    )
+    assert not predictions.isna().all(), "Not all consensus predictions should be NA."
+
+    scores = output_mudata.mod["rna"].obs["consensus_score"]
+    assert all(0 <= v <= 1 for v in scores), (
+        "Consensus scores should be in the range [0, 1]."
+    )
+
+
+def test_with_weights(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_mdata,
+            "--input_obs_predictions",
+            "scanvi_pred",
+            "--input_obs_predictions",
+            "celltypist_pred",
+            "--input_obs_predictions",
+            "singler_pred",
+            "--weights",
+            "1.0",
+            "--weights",
+            "1.0",
+            "--weights",
+            "2.0",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist."
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert {"consensus_pred", "consensus_score"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs."
+
+
+def test_custom_output_obs_names(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_mdata,
+            "--input_obs_predictions",
+            "scanvi_pred",
+            "--input_obs_predictions",
+            "celltypist_pred",
+            "--output_obs_predictions",
+            "my_consensus_pred",
+            "--output_obs_score",
+            "my_consensus_score",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist."
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert {"my_consensus_pred", "my_consensus_score"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Custom output keys not found in .obs."
+
+
+def test_tie_returns_none_by_default(run_component, random_h5mu_path, tied_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            tied_mdata,
+            "--input_obs_predictions",
+            "method_a_pred",
+            "--input_obs_predictions",
+            "method_b_pred",
+            "--output",
+            output_file,
+        ]
+    )
+
+    output_mudata = mu.read_h5mu(output_file)
+    predictions = output_mudata.mod["rna"].obs["consensus_pred"]
+    assert predictions.isna().all(), "All tied predictions should be None."
+
+
+def test_tie_label(run_component, random_h5mu_path, tied_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            tied_mdata,
+            "--input_obs_predictions",
+            "method_a_pred",
+            "--input_obs_predictions",
+            "method_b_pred",
+            "--tie_label",
+            "Unknown",
+            "--output",
+            output_file,
+        ]
+    )
+
+    output_mudata = mu.read_h5mu(output_file)
+    predictions = output_mudata.mod["rna"].obs["consensus_pred"]
+    assert (predictions == "Unknown").all(), "All tied predictions should be 'Unknown'."
+
+
+def test_with_probabilities(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_mdata,
+            "--input_obs_predictions",
+            "scanvi_pred",
+            "--input_obs_predictions",
+            "celltypist_pred",
+            "--input_obs_predictions",
+            "singler_pred",
+            "--input_obs_probabilities",
+            "scanvi_prob",
+            "--input_obs_probabilities",
+            "celltypist_prob",
+            "--input_obs_probabilities",
+            "singler_prob",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist."
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert {"consensus_pred", "consensus_score"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs."
+    assert not output_mudata.mod["rna"].obs["consensus_pred"].isna().all(), (
+        "Not all probability-weighted consensus predictions should be NA."
+    )
+
+
+def test_mismatched_probabilities_error(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_mdata,
+                "--input_obs_predictions",
+                "scanvi_pred",
+                "--input_obs_predictions",
+                "celltypist_pred",
+                "--input_obs_probabilities",
+                "scanvi_prob",
+                "--output",
+                output_file,
+            ]
+        )
+    assert re.search(
+        r"ValueError: --input_obs_probabilities must have the same length as --input_obs_predictions",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_mismatched_weights_error(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_mdata,
+                "--input_obs_predictions",
+                "scanvi_pred",
+                "--input_obs_predictions",
+                "celltypist_pred",
+                "--weights",
+                "1.0",
+                "--output",
+                output_file,
+            ]
+        )
+    assert re.search(
+        r"ValueError: --weights must have the same length as --input_obs_predictions",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_missing_prediction_column_error(run_component, random_h5mu_path, input_mdata):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_mdata,
+                "--input_obs_predictions",
+                "nonexistent_pred",
+                "--output",
+                output_file,
+            ]
+        )
+    assert re.search(
+        r"ValueError: Column 'nonexistent_pred' not found in .obs",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/onclass/config.vsh.yaml
+++ b/src/annotate/onclass/config.vsh.yaml
@@ -0,0 +1,173 @@
+name: onclass
+namespace: annotate
+scope: "public"
+description: |
+  OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. 
+  These similarities enable OnClass to annotate cell types that are never seen in the training data.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used.
+        required: false
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
+  - name: Ontology
+    description: Ontology input files
+    arguments:
+      - name: "--cl_nlp_emb_file"
+        type: file
+        description: The .nlp.emb file with the cell type embeddings.
+        required: true
+        example: cl.ontology.nlp.emb
+      - name: "--cl_ontology_file"
+        type: file
+        description: The .ontology file with the cell type ontology.
+        required: true
+        example: cl.ontology
+      - name: "--cl_obo_file"
+        type: file
+        description: The .obo file with the cell type ontology.
+        required: true
+        example: cl.obo
+
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+        example: "cell_ontology_class"
+        required: true
+      - name:  "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
+      - name: "--unknown_celltype"
+        type: string
+        default: "Unknown"
+        required: false
+        description: |
+          Label for unknown cell types.
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: onclass_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: onclass_prob 
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+  - name: Model arguments
+    description: Model arguments
+    arguments:
+      - name: "--model"
+        type: string
+        description: | 
+          "Pretrained model path without a file extension. If not provided, the model will be trained 
+          on the reference data and --reference should be provided. The path namespace should contain:
+            - a .npz or .pkl file
+            - a .data file
+            - a .meta file
+            - a .index file
+          e.g. /path/to/model/pretrained_model_target1 as saved by OnClass."
+        required: false
+        direction: input
+      - name: "--max_iter"
+        type: integer
+        default: 30
+        required: false
+        description: Maximum number of iterations for training the model.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/cross_check_genes.py
+  - path: /src/utils/subset_vars.py
+  - path: /src/utils/set_var_index.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: python:3.11
+    setup:
+      - type: python
+        packages:
+          - OnClass~=1.3
+          - tensorflow
+          - obonet
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highcpu, highmem, highdisk]
--- a/src/annotate/onclass/script.py
+++ b/src/annotate/onclass/script.py
@@ -0,0 +1,218 @@
+import sys
+import mudata as mu
+import numpy as np
+from OnClass.OnClassModel import OnClassModel
+import obonet
+from typing import Dict, List, Tuple
+
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "model": None,
+    # "reference": None,
+    # "model": "resources_test/annotation_test_data/onclass_model/example_file_model",
+    "reference_obs_target": "cell_ontology_class",
+    "input_layer": None,
+    "reference_layer": None,
+    "max_iter": 100,
+    "output_obs_predictions": None,
+    "output_obs_probability": None,
+    "cl_nlp_emb_file": "resources_test/annotation_test_data/ontology/cl.ontology.nlp.emb",
+    "cl_ontology_file": "resources_test/annotation_test_data/ontology/cl.ontology",
+    "cl_obo_file": "resources_test/annotation_test_data/ontology/cl.obo",
+    "output_compression": "gzip",
+    "input_var_gene_names": "gene_symbol",
+    "input_reference_gene_overlap": 100,
+    "reference_var_input": None,
+    "reference_var_gene_names": None,
+    "unkown_celltype": "Unknown",
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from cross_check_genes import cross_check_genes
+from set_var_index import set_var_index
+from subset_vars import subset_vars
+
+logger = setup_logger()
+
+
+def map_celltype_to_ontology_id(
+    cl_obo_file: str,
+) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """
+    Map cell type names to ontology IDs and vice versa.
+
+    Parameters
+    ----------
+    cl_obo_file : str
+        Path to the cell ontology file.
+
+    Returns
+    -------
+    Tuple[Dict[str, str], Dict[str, str]]
+        A tuple of two dictionaries. The first dictionary maps cell ontology IDs to cell type names.
+        The second dictionary maps cell type names to cell ontology IDs.
+    """
+    graph = obonet.read_obo(cl_obo_file)
+    cl_id_to_name = {id_: data.get("name") for id_, data in graph.nodes(data=True)}
+    cl_id_to_name = {k: v for k, v in cl_id_to_name.items() if v is not None}
+    name_to_cl_id = {v: k for k, v in cl_id_to_name.items()}
+    return cl_id_to_name, name_to_cl_id
+
+
+def cell_type_prediction(
+    model: OnClassModel,
+    input_matrix: np.array,
+    input_features: List[str],
+    id_to_name: dict,
+) -> Tuple[List[str], List[float]]:
+    """
+    Predict cell types for input data and save results to Anndata obj.
+
+    Parameters
+    ----------
+    model : OnClassModel
+        The OnClass model.
+    input_matrix : np.array
+        The input data matrix.
+    input_modality : ad.AnnData
+        The input data Anndata object.
+    id_to_name : dict
+        Dictionary mapping cell ontology IDs to cell type names.
+
+    Returns
+    -------
+    predictions: List[str]
+        The predicted cell types.
+    probabilities: List[float]
+        Probabilities of the predicted cell types.
+    """
+    corr_test_feature = model.ProcessTestFeature(
+        test_feature=input_matrix,
+        test_genes=input_features,
+        log_transform=False,
+    )
+    onclass_pred = model.Predict(
+        corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
+    )
+    pred_label = [model.i2co[ind] for ind in onclass_pred[2]]
+    pred_cell_type_label = [id_to_name[id] for id in pred_label]
+    prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
+
+    return pred_cell_type_label, prob_cell_type_label
+
+
+def main():
+    if (not par["model"] and not par["reference"]) or (
+        par["model"] and par["reference"]
+    ):
+        raise ValueError(
+            "Make sure to provide either 'model' or 'reference', but not both."
+        )
+
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_adata = input_mudata.mod[par["modality"]]
+    input_modality = input_adata.copy()
+
+    # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
+    input_matrix = (
+        input_modality.layers[par["input_layer"]]
+        if par["input_layer"]
+        else input_modality.X
+    )
+    # Onclass needs dense matrix format
+    input_matrix = input_matrix.toarray()
+
+    id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"])
+
+    if par["model"]:
+        logger.info("Predicting cell types using pre-trained model")
+        model = OnClassModel(
+            cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
+            cell_type_network_file=par["cl_ontology_file"],
+        )
+
+        model.BuildModel(use_pretrain=par["model"], ngene=None)
+        cross_check_genes(
+            model.genes, input_modality.var.index, par["input_reference_gene_overlap"]
+        )
+
+    elif par["reference"]:
+        logger.info("Reading reference data")
+        model = OnClassModel(
+            cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
+            cell_type_network_file=par["cl_ontology_file"],
+        )
+
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+        reference_modality = set_var_index(
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
+        )
+
+        # subset to HVG if required
+        if par["reference_var_input"]:
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
+
+        cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            par["input_reference_gene_overlap"],
+        )
+
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
+        # Onclass needs dense matrix format
+        reference_matrix = reference_matrix.toarray()
+
+        logger.info("Training a model from reference...")
+
+        labels = reference_modality.obs[par["reference_obs_target"]].tolist()
+        labels_cl = [
+            name_to_id[label] if label in name_to_id else par["unknown_celltype"]
+            for label in labels
+        ]
+
+        _ = model.EmbedCellTypes(labels_cl)
+        corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature(
+            train_feature=reference_matrix,
+            train_label=labels_cl,
+            train_genes=reference_modality.var.index,
+            test_feature=input_matrix,
+            test_genes=input_modality.var.index,
+            log_transform=False,
+        )
+        model.BuildModel(ngene=len(corr_train_genes))
+        model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"])
+
+    logger.info("Predicting cell types")
+    predictions, probabilities = cell_type_prediction(
+        model, input_matrix, input_modality.var.index, id_to_name
+    )
+
+    logger.info("Writing output data")
+    input_adata.obs[par["output_obs_predictions"]] = predictions
+    input_adata.obs[par["output_obs_probability"]] = probabilities
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/onclass/test.py
+++ b/src/annotate/onclass/test.py
@@ -0,0 +1,181 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+
+## VIASH START
+meta = {"resources_dir": "resources_test"}
+## VIASH END
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+cl_nlp_emb_file = (
+    f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
+)
+cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology"
+cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo"
+model_file = (
+    f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
+)
+
+
+def test_simple_execution(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--max_iter",
+            "10",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
+
+    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
+    )
+
+
+def test_custom_obs(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output_obs_predictions",
+            "dummy_pred_1",
+            "--output_obs_probability",
+            "dummy_prob_1",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--max_iter",
+            "10",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert set(output_mudata.mod["rna"].obs.keys()) == {"dummy_pred_1", "dummy_prob_1"}
+
+    obs_values = output_mudata.mod["rna"].obs["dummy_prob_1"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        ".obs at dummy_prob_1 has values outside the range [0, 1]"
+    )
+
+
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--output",
+                output_file,
+                "--cl_nlp_emb_file",
+                cl_nlp_emb_file,
+                "--cl_ontology_file",
+                cl_ontology_file,
+                "--cl_obo_file",
+                cl_obo_file,
+                "--reference_obs_target",
+                "cell_ontology_class",
+                "--reference_var_gene_names",
+                "ensemblid",
+            ]
+        )
+    assert re.search(
+        r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_pretrained_model(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--input_var_gene_names",
+            "gene_symbol",
+            "--sanitize_ensembl_ids",
+            "False",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--model",
+            model_file,
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
+
+    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/popv/config.vsh.yaml
+++ b/src/annotate/popv/config.vsh.yaml
@@ -0,0 +1,161 @@
+name: popv
+namespace: "annotate"
+scope: "public"
+description: "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV."
+authors:
+  - __merge__: /src/authors/matthias_beyens.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/robrecht_cannoodt.yaml
+    roles: [ author ]
+argument_groups:
+  - name: Inputs
+    description: Arguments related to the input (aka query) dataset.
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: Input h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.
+        required: false
+      - name: "--input_obs_batch"
+        type: string
+        description: Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.
+        required: false
+      - name: "--input_var_subset"
+        type: string
+        description: Subset the input object with this column.
+        required: false
+      - name: "--input_obs_label"
+        type: string
+        description: Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `"unknown_celltype_label"`.
+        required: false
+      - name: "--unknown_celltype_label"
+        type: string
+        description: If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.
+        default: "unknown"
+        required: false
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "User-provided reference tissue. The data that will be used as reference to call cell types."
+        example: TS_Bladder_filtered.h5ad
+        direction: input
+        required: true
+      - name: "--reference_layer"
+        type: string
+        description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.
+        required: false
+      - name: "--reference_obs_label"
+        type: string
+        description: Key in obs field of reference AnnData with cell-type information.
+        default: "cell_ontology_class"
+        required: false
+      - name: "--reference_obs_batch"
+        type: string
+        description: Key in obs field of input adata for batch information.
+        default: "donor_assay"
+        required: false
+      # - name: "--reference_models"
+      #   type: file
+      #   description: Pretrained models. Can be a directory or a tar gz.
+      #   required: false
+      #   example: pretrained_models_Bladder_ts.tar.gz
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        required: true
+        example: output.h5mu
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+      # - name: "--output_models"
+      #   type: file
+      #   direction: output
+      #   description: If `prediction_mode == "retrain"`, saves models to a directory and compresses the results into a tar gz.
+      #   example: "output.tar.gz"
+      #   required: false
+  - name: Arguments
+    description: Other arguments.
+    arguments:
+      - name: "--methods"
+        type: string
+        description: "Methods to call cell types. By default, runs KNN_SCVI and SCANVI_POPV."
+        example: ["KNN_SCVI", "SCANVI_POPV"]
+        choices: [CELLTYPIST, KNN_BBKNN, KNN_HARMONY, KNN_SCANORAMA, KNN_SCVI, ONCLASS, Random_Forest, SCANVI_POPV, Support_Vector]
+        required: true
+        multiple: true
+      # - name: "--prediction_mode"
+      #   type: string
+      #   description: |
+      #     Execution mode of cell-type annotation.
+      #     "retrain": Train all prediction models and saves them to disk. Argument `output_models` must be defined.
+      #     "inference": Classify all cells based on pretrained models. Argument `reference_models` must be defined.
+      #     "fast": Fast inference using only query cells and single epoch in scArches.
+      # - name: "--plots"
+      #   type: boolean
+      #   description: "Creation of agreement and frequency plots between selected cell type algorithmn(s) and final PopV ensemble called cell type."
+      #   default: false
+      #   required: false
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    #image: nvcr.io/nvidia/pytorch:22.12-py3
+    image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
+
+    setup:
+      - type: docker
+        env:
+          # Build extensions without AVX512/AVX2 support
+          - CFLAGS="-mno-avx512f -mno-avx2" 
+          - CPPFLAGS="-mno-avx512f -mno-avx2"
+      - type: apt
+        packages:
+          - procps
+          - git
+          - build-essential
+      - type: python
+        packages:
+          - popv~=0.6.1
+      # See https://github.com/YosefLab/PopV/issues/30
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+      # download ontology required by popv
+      - type: docker
+        run: |
+          cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+
+runners:
+  - type: executable
+    # docker_run_args: ["--gpus all"]
+  - type: nextflow
+    directives:
+      # TODO: should add new label highmem-single-gpu and lowmem-single-gpu
+      label: [highmem, highcpu, highdisk]
--- a/src/annotate/popv/script.py
+++ b/src/annotate/popv/script.py
@@ -0,0 +1,227 @@
+import sys
+import re
+import tempfile
+import typing
+import numpy as np
+import mudata as mu
+import anndata as ad
+import popv
+
+# todo: is this still needed?
+from torch.cuda import is_available as cuda_is_available
+
+try:
+    from torch.backends.mps import is_available as mps_is_available
+except ModuleNotFoundError:
+    # Older pytorch versions
+    # MacOS GPUs
+    def mps_is_available():
+        return False
+
+
+# where to find the obo files
+cl_obo_folder = "/opt/PopV/resources/ontology/"
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    # "input": "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu",
+    "modality": "rna",
+    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5ad",
+    "input_obs_batch": None,
+    "input_layer": None,
+    "input_obs_label": None,
+    "input_var_subset": None,
+    "unknown_celltype_label": "unknown",
+    "reference_layer": None,
+    "reference_obs_label": "cell_ontology_class",
+    "reference_obs_batch": "donor_assay",
+    "output": "output.h5mu",
+    "output_compression": "gzip",
+    "methods": [
+        # "celltypist",
+        # "knn_on_bbknn",
+        # "knn_on_scanorama",
+        # "knn_on_scvi",
+        "Random_Forest",
+        # "scanvi",
+        "svm",
+    ],
+}
+meta = {"resources_dir": "src/utils", "temp_dir": "/tmp"}
+# for debugging the obo folder can be somewhere local
+cl_obo_folder = "popv_cl_ontology/"
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+
+logger = setup_logger()
+
+use_gpu = cuda_is_available()
+logger.info("GPU enabled? %s", use_gpu)
+popv.settings.accelerator = "cuda" if use_gpu else "cpu"
+
+
+# Helper functions
+def get_X(
+    adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]
+):
+    """Fetch the counts data from X or a layer. Subset columns by var_index if so desired."""
+    if var_index:
+        adata = adata[:, var_index]
+    if layer:
+        return adata.layers[layer]
+    else:
+        return adata.X
+
+
+def get_obs(adata: ad.AnnData, obs_par_names):
+    """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch."""
+    obs_columns = [par[x] for x in obs_par_names if par[x]]
+    return adata.obs[obs_columns]
+
+
+def get_var(adata: ad.AnnData, var_index: list[str]):
+    """Fetch the var dataframe. Subset rows by var_index if so desired."""
+    return adata.var.loc[var_index]
+
+
+def main(par, meta):
+    assert len(par["methods"]) >= 1, (
+        "Please, specify at least one method for cell typing."
+    )
+    logger.info("Cell typing methods: {}".format(par["methods"]))
+
+    ### PREPROCESSING REFERENCE ###
+    logger.info("### PREPROCESSING REFERENCE ###")
+
+    # take a look at reference data
+    logger.info("Reading reference data '%s'", par["reference"])
+    reference = ad.read_h5ad(par["reference"])
+
+    logger.info("Setting reference var index to Ensembl IDs")
+    reference.var["gene_symbol"] = list(reference.var.index)
+    reference.var.index = [
+        re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]
+    ]
+
+    logger.info("Detect number of samples per label")
+    min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
+    n_samples_per_label = np.max((min_celltype_size, 100))
+
+    ### PREPROCESSING INPUT ###
+    logger.info("### PREPROCESSING INPUT ###")
+    logger.info("Reading '%s'", par["input"])
+    input = mu.read_h5mu(par["input"])
+    input_modality = input.mod[par["modality"]]
+
+    # subset with var column
+    if par["input_var_subset"]:
+        logger.info("Subset input with .var['%s']", par["input_var_subset"])
+        assert par["input_var_subset"] in input_modality.var, (
+            f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
+        )
+        input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]]
+
+    ### ALIGN REFERENCE AND INPUT ###
+    logger.info("### ALIGN REFERENCE AND INPUT ###")
+
+    logger.info("Detecting common vars based on ensembl ids")
+    common_ens_ids = list(
+        set(reference.var.index).intersection(set(input_modality.var.index))
+    )
+
+    logger.info("  reference n_vars: %i", reference.n_vars)
+    logger.info("  input n_vars: %i", input_modality.n_vars)
+    logger.info("  intersect n_vars: %i", len(common_ens_ids))
+    assert len(common_ens_ids) >= 100, "The intersection of genes is too small."
+
+    # subset input objects to make sure popv is using the data we expect
+    input_modality = ad.AnnData(
+        X=get_X(input_modality, par["input_layer"], common_ens_ids),
+        obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
+        var=get_var(input_modality, common_ens_ids),
+    )
+    reference = ad.AnnData(
+        X=get_X(reference, par["reference_layer"], common_ens_ids),
+        obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
+        var=get_var(reference, common_ens_ids),
+    )
+
+    # remove layers that
+
+    ### ALIGN REFERENCE AND INPUT ###
+    logger.info("### ALIGN REFERENCE AND INPUT ###")
+
+    with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir:
+        logger.info("Run PopV processing")
+        pq = popv.preprocessing.Process_Query(
+            # input
+            query_adata=input_modality,
+            query_batch_key=par["input_obs_batch"],
+            query_layer_key=None,  # this is taken care of by subset
+            # reference
+            ref_adata=reference,
+            ref_layer_key=None,  # this is taken care of by subset
+            ref_labels_key=par["reference_obs_label"],
+            ref_batch_key=par["reference_obs_batch"],
+            # options
+            unknown_celltype_label=par["unknown_celltype_label"],
+            n_samples_per_label=n_samples_per_label,
+            # pretrained model
+            # Might need to be parameterized at some point
+            prediction_mode="retrain",
+            pretrained_scvi_path=None,
+            # outputs
+            # Might need to be parameterized at some point
+            save_path_trained_models=temp_dir,
+            # hardcoded values
+            cl_obo_folder=cl_obo_folder,
+        )
+        method_kwargs = {}
+        if "scanorama" in par["methods"]:
+            method_kwargs["scanorama"] = {"approx": False}
+        logger.info("Annotate data")
+        popv.annotation.annotate_data(
+            adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs
+        )
+
+    popv_input = pq.adata[input_modality.obs_names]
+
+    # select columns starting with "popv_"
+    popv_obs_cols = popv_input.obs.columns[
+        popv_input.obs.columns.str.startswith("popv_")
+    ]
+
+    # create new data frame with selected columns
+    df_popv = popv_input.obs[popv_obs_cols]
+
+    # remove prefix from column names
+    df_popv.columns = df_popv.columns.str.replace("popv_", "")
+
+    # store output in mudata .obsm
+    input.mod[par["modality"]].obsm["popv_output"] = df_popv
+
+    # copy important output in mudata .obs
+    for col in ["popv_prediction"]:
+        if col in popv_input.obs.columns:
+            input.mod[par["modality"]].obs[col] = popv_input.obs[col]
+
+    # code to explore how the output differs from the original
+    # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]:
+    #     old_keys = set(getattr(pq_adata_orig, attr).keys())
+    #     new_keys = set(getattr(pq.adata, attr).keys())
+    #     diff_keys = list(new_keys.difference(old_keys))
+    #     diff_keys.sort()
+    #     print(f"{attr}:", flush=True)
+    #     for key in diff_keys:
+    #         print(f"  {key}", flush=True)
+
+    # write output
+    logger.info("Writing %s", par["output"])
+    input.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main(par, meta)
--- a/src/annotate/popv/test.py
+++ b/src/annotate/popv/test.py
@@ -0,0 +1,95 @@
+import sys
+import os
+import pytest
+import mudata as mu
+
+## VIASH START
+meta = {"resources_dir": "resources_test"}
+## VIASH END
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
+
+
+def test_simple_execution(run_component):
+    output_file = "output.h5mu"
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--output",
+            "output.h5mu",
+            "--methods",
+            "Random_Forest;Support_Vector",
+        ]
+    )
+
+    # check whether file exists
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    # read output mudata
+    output = mu.read_h5mu(output_file)
+
+    # check output
+    expected_rna_obs_cols = ["popv_prediction"]
+    for col in expected_rna_obs_cols:
+        assert col in output.mod["rna"].obs.columns, (
+            f"could not find columns .mod['rna'].obs['{col}']"
+        )
+
+    print(f"output: {output}", flush=True)
+
+
+def test_popv_with_other_layer(run_component, tmp_path):
+    input_h5mu = mu.read(input_file)
+    input_h5mu.mod["rna"].layers["test"] = input_h5mu.mod["rna"].X.copy()
+    input_h5mu.write_h5mu(tmp_path / "input.h5mu")
+    run_component(
+        [
+            "--input",
+            tmp_path / "input.h5mu",
+            "--reference",
+            reference_file,
+            "--output",
+            "output.h5mu",
+            "--methods",
+            "Random_Forest;Support_Vector;KNN_SCANORAMA;KNN_SCVI",
+        ]
+    )
+
+
+def test_popv_with_non_overlapping_cells(run_component, tmp_path):
+    input_h5mu = mu.read(input_file)
+
+    # copy previous modalities
+    rna_ad = input_h5mu.mod["rna"].copy()
+    prot_ad = input_h5mu.mod["prot"].copy()
+
+    # change obs_names such that the cells do not overlap
+    rna_ad.obs_names = [f"rna_{x}" for x in rna_ad.obs_names]
+    prot_ad.obs_names = [f"prot_{x}" for x in prot_ad.obs_names]
+
+    # write new h5mu to file
+    new_h5mu = mu.MuData({"rna": rna_ad, "prot": prot_ad})
+    new_h5mu.write_h5mu(tmp_path / "input.h5mu")
+
+    # run component
+    run_component(
+        [
+            "--input",
+            tmp_path / "input.h5mu",
+            "--reference",
+            reference_file,
+            "--output",
+            "output.h5mu",
+            "--methods",
+            "Random_Forest;Support_Vector;KNN_SCANORAMA",
+        ]
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/random_forest_annotation/config.vsh.yaml
+++ b/src/annotate/random_forest_annotation/config.vsh.yaml
@@ -0,0 +1,170 @@
+name: random_forest_annotation
+namespace: annotate
+scope: "public"
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: Key in obs field of reference modality with cell-type information.
+        required: true
+      - name:  "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: random_forest_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: random_forest_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+  
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--n_estimators"
+        type: integer
+        required: false
+        default: 100
+        description: Number of trees in the random forest.
+      - name: "--max_depth"
+        type: integer
+        required: false
+        description: |
+          Maximum depth of the trees in the random forest. 
+          If not provided, the nodes are expanded until all leaves only contain a single sample.
+      - name: "--criterion"
+        type: string
+        required: false
+        choices: ["gini", "entropy", "log_loss"]
+        default: "gini"
+        description: The function to measure the quality of a split.
+      - name: "--class_weight"
+        type: string
+        required: false
+        default: "balanced_subsample"
+        choices: ["balanced", "balanced_subsample", "uniform"]
+        description: |
+          Weights associated with classes.
+          The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.
+          The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.
+          The `uniform` mode gives all classes a weight  of one.
+      - name: "--max_features"
+        type: string
+        default: "200"
+        description: |
+          The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.
+          If integer: consider max_features features at each split.
+          If `sqrt`: max_features is the squareroot of all input features.
+          If `log2`: max_features is the log2 of all input features.
+          If `all`: max features equals all input features.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/cross_check_genes.py
+  - path: /src/utils/subset_vars.py
+  - path: /src/utils/set_var_index.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - libhdf5-dev
+          - procps
+      - type: python
+        packages:
+          - scikit-learn==1.4.2
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highcpu, highmem, highdisk]
--- a/src/annotate/random_forest_annotation/script.py
+++ b/src/annotate/random_forest_annotation/script.py
@@ -0,0 +1,162 @@
+import sys
+import mudata as mu
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+import pickle
+
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "input_var_gene_names": None,
+    "input_reference_gene_overlap": 100,
+    "modality": "rna",
+    "reference": None,
+    # "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    # "model": None,
+    "model": "resources_test/annotation_test_data/rf_model.pkl",
+    "reference_obs_target": "cell_ontology_class",
+    "reference_var_gene_names": "ensemblid",
+    "reference_var_input": None,
+    "input_layer": None,
+    "reference_layer": None,
+    "n_estimators": 100,
+    "criterion": "gini",
+    "max_depth": None,
+    "class_weight": None,
+    "max_features": 200,
+    "output_compression": "gzip",
+    "output_obs_predictions": "random_forest_pred",
+    "output_obs_probability": "random_forest_probability",
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from cross_check_genes import cross_check_genes
+from subset_vars import subset_vars
+from set_var_index import set_var_index
+
+logger = setup_logger()
+
+
+def main():
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_adata = input_mudata.mod[par["modality"]]
+    input_modality = input_adata.copy()
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
+
+    # Handle max_features parameter
+    max_features_conversion = {
+        "all": None,
+        "sqrt": "sqrt",
+        "log2": "log2",
+    }
+    try:
+        max_features = max_features_conversion.get(
+            par["max_features"], int(par["max_features"])
+        )
+    except ValueError:
+        raise ValueError(
+            f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'"
+        )
+
+    if (not par["model"] and not par["reference"]) or (
+        par["model"] and par["reference"]
+    ):
+        raise ValueError(
+            "Make sure to provide either 'model' or 'reference', but not both."
+        )
+
+    if par["model"]:
+        logger.info("Loading a pre-trained model")
+        model = pickle.load(open(par["model"], "rb"))
+        if hasattr(model, "_feature_names_in"):
+            common_genes = cross_check_genes(
+                input_modality.var.index,
+                model._feature_names_in,
+                par["input_reference_gene_overlap"],
+            )
+            if not len(common_genes) == len(model._feature_names_in):
+                raise ValueError("Input dataset does not contain all model features.")
+            input_modality = input_modality[:, common_genes]
+            input_matrix = (
+                input_modality.layers[par["input_layer"]]
+                if par["input_layer"]
+                else input_modality.X
+            )
+
+        else:
+            logger.warning(
+                "Model does not have feature names saved. Could not check overlap of model's features with query genes."
+            )
+
+    elif par["reference"]:
+        logger.info("Reading reference data")
+
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+        reference_modality = set_var_index(
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
+        )
+
+        # subset to HVG if required
+        if par["reference_var_input"]:
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
+
+        # Query and input require the exact same features
+        common_genes = cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            par["input_reference_gene_overlap"],
+        )
+        reference_modality = reference_modality[:, common_genes]
+        input_modality = input_modality[:, common_genes]
+
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
+        input_matrix = (
+            input_modality.layers[par["input_layer"]]
+            if par["input_layer"]
+            else input_modality.X
+        )
+
+        logger.info("Training a model...")
+        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
+        model = RandomForestClassifier(
+            n_estimators=par["n_estimators"],
+            criterion=par["criterion"],
+            max_depth=par["max_depth"],
+            class_weight=par["class_weight"]
+            if not par["class_weight"] == "uniform"
+            else None,
+            max_features=max_features,
+        )
+        model.fit(reference_matrix, labels)
+        model._feature_names_in = reference_modality.var.index
+
+    logger.info("Running predictions...")
+    predictions = model.predict(input_matrix)
+    probabilities = np.max(model.predict_proba(input_matrix), axis=1)
+
+    input_adata.obs[par["output_obs_predictions"]] = predictions
+    input_adata.obs[par["output_obs_probability"]] = probabilities
+
+    logger.info("Writing output data")
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/random_forest_annotation/test.py
+++ b/src/annotate/random_forest_annotation/test.py
@@ -0,0 +1,240 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+from sklearn.ensemble import RandomForestClassifier
+import pickle
+
+## VIASH START
+meta = {"resources_dir": "resources_test"}
+sys.path.append("src/utils")
+## VIASH END
+
+input_file = (
+    f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+)
+reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
+
+sys.path.append(meta["resources_dir"])
+from cross_check_genes import cross_check_genes
+from set_var_index import set_var_index
+
+
+@pytest.fixture
+def dummy_model(tmp_path):
+    reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy()
+    reference_modality = set_var_index(reference_modality, "ensemblid")
+
+    input_modality = mu.read_h5mu(input_file).mod["rna"].copy()
+    input_modality = set_var_index(input_modality, None)
+
+    common_genes = cross_check_genes(
+        input_modality.var.index, reference_modality.var.index
+    )
+    reference_modality = reference_modality[:, common_genes]
+
+    labels = reference_modality.obs["cell_ontology_class"].to_numpy()
+    model = RandomForestClassifier()
+    model.fit(reference_modality.X, labels)
+    model._feature_names_in = reference_modality.var.index
+
+    model_path = tmp_path / "model.pkl"
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+
+    return model_path
+
+
+def test_simple_execution(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "random_forest_pred",
+        "random_forest_probability",
+    ]
+
+    obs_values = output_mudata.mod["rna"].obs["random_forest_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_custom_out_obs_model_params(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output_obs_predictions",
+            "dummy_pred",
+            "--output_obs_probability",
+            "dummy_probability",
+            "--n_estimators",
+            "10",
+            "--criterion",
+            "entropy",
+            "--max_depth",
+            "5",
+            "--class_weight",
+            "balanced",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "dummy_pred",
+        "dummy_probability",
+    ]
+
+    obs_values = output_mudata.mod["rna"].obs["dummy_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_with_model(run_component, random_h5mu_path, dummy_model):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--model",
+            dummy_model,
+            "--output",
+            output_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "random_forest_pred",
+        "random_forest_probability",
+    ]
+
+    obs_values = output_mudata.mod["rna"].obs["random_forest_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--output",
+                output_file,
+                "--reference_obs_target",
+                "cell_ontology_class--reference_var_gene_names",
+                "ensemblid",
+            ]
+        )
+    assert re.search(
+        r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model):
+    output_file = random_h5mu_path()
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--output",
+                output_file,
+                "--reference",
+                reference_file,
+                "--reference_obs_target",
+                "cell_ontology_class",
+                "--reference_var_gene_names",
+                "ensemblid",
+                "--model",
+                dummy_model,
+            ]
+        )
+    assert re.search(
+        r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+def test_invalid_max_features(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--output",
+                output_file,
+                "--reference_obs_target",
+                "cell_ontology_class",
+                "--reference_var_gene_names",
+                "ensemblid",
+                "--max_features",
+                "invalid_value",
+            ]
+        )
+    assert re.search(
+        r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/scanvi/config.vsh.yaml
+++ b/src/annotate/scanvi/config.vsh.yaml
@@ -0,0 +1,169 @@
+name: scanvi
+namespace: "annotate"
+scope: "public"
+description: |
+  scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.
+  This component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.
+
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        alternatives: ["-i"]
+        type: file
+        description: Input h5mu file. Note that this needs to be the exact same dataset as the --scvi_model was trained on.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: |
+          Which modality from the input MuData file to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        required: false
+        description: "Input layer to use. If None, X is used"
+      - name: "--var_input"
+        type: string
+        required: false
+        description: ".var column containing highly variable genes that were used to train the scVi model. By default, do not subset genes."
+      - name: "--var_gene_names"
+        type: string
+        required: false
+        description: ".var column containing gene names. By default, use the index."
+      - name: "--obs_labels"
+        type: string
+        required: true
+        description: ".obs field containing the labels"
+      - name: "--unlabeled_category"
+        type: string
+        default: "Unknown"
+        description: | 
+          Value in the --obs_labels field that indicates unlabeled observations
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
+  - name: scVI Model
+    arguments:
+      - name: "--scvi_model"
+        type: file
+        description: "Pretrained SCVI reference model to initialize the SCANVI model with."
+        example: scvi_model.pt
+        direction: input
+        required: true
+  
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        alternatives: ["-o"]
+        type: file
+        example: output.h5mu
+        description: Output h5mu file.
+        direction: output
+        required: true
+      - name: "--output_model"
+        type: file
+        description: Folder where the state of the trained model will be saved to.
+        required: false
+        direction: output
+        example: path/to/output_model/
+      - name: "--obsm_output"
+        type: string
+        default: "X_scanvi_integrated"
+        required: false
+        description: "In which .obsm slot to store the resulting integrated embedding."
+      - name: "--obs_output_predictions"
+        type: string
+        default: scanvi_pred
+        description: "In which .obs slot to store the predicted labels."
+      - name: "--obs_output_probabilities"
+        type: string
+        default: scanvi_proba
+        description: "In which. obs slot to store the probabilities of the predicted labels."
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+ 
+  - name: "scANVI training arguments"
+    arguments:
+      - name: "--early_stopping"
+        required: false
+        type: boolean
+        description: "Whether to perform early stopping with respect to the validation set."
+      - name: "--early_stopping_monitor"
+        choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
+        default: "elbo_validation"
+        type: string
+        description: "Metric logged during validation set epoch."
+      - name: "--early_stopping_patience"
+        type: integer
+        min: 1
+        default: 45
+        description: "Number of validation epochs with no improvement after which training will be stopped."
+      - name: "--early_stopping_min_delta"
+        min: 0
+        type: double
+        default: 0.0
+        description: "Minimum change in the monitored quantity to qualify as an improvement, 
+                      i.e. an absolute change of less than min_delta, will count as no improvement."
+      - name: "--max_epochs"
+        type: integer
+        description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest."
+        required: false
+      - name: "--reduce_lr_on_plateau"
+        description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
+        type: boolean
+        default: True
+      - name: "--lr_factor"
+        description: "Factor to reduce learning rate."
+        type: double
+        default: 0.6
+        min: 0
+      - name: "--lr_patience"
+        description: "Number of epochs with no improvement after which learning rate will be reduced."
+        type: double
+        default: 30
+        min: 0
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/subset_vars.py
+  - path: /src/utils/compress_h5mu.py
+  - path: /src/utils/set_var_index.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/scvi_model/
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+
+engines:
+- type: docker
+  image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
+  setup:
+    - type: python
+      __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
+    - type: python
+      packages:
+        - scvi-tools[cuda]~=1.4.1
+  test_setup:
+    - type: python
+      __merge__: [ /src/base/requirements/viashpy.yaml, .]
+
+runners:
+- type: executable
+  # docker_run_args: ["--gpus all"]
+- type: nextflow
+  directives:
+    label: [midcpu, midmem, gpu, highdisk]
--- a/src/annotate/scanvi/script.py
+++ b/src/annotate/scanvi/script.py
@@ -0,0 +1,112 @@
+import mudata
+import scvi
+import numpy as np
+
+### VIASH START
+par = {
+    "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "modality": "rna",
+    "input_layer": None,
+    "var_input": None,
+    "output": "foo.h5mu",
+    "obsm_output": "X_scvi_integrated",
+    "early_stopping": True,
+    "early_stopping_monitor": "elbo_validation",
+    "early_stopping_patience": 45,
+    "early_stopping_min_delta": 0,
+    "reduce_lr_on_plateau": True,
+    "lr_factor": 0.6,
+    "lr_patience": 30,
+    "max_epochs": 500,
+    "n_obs_min_count": 10,
+    "n_var_min_count": 10,
+    "output_model": "test/",
+    "output_compression": "gzip",
+}
+
+meta = {"resources_dir": "src/utils"}
+### VIASH END
+
+import sys
+
+sys.path.append(meta["resources_dir"])
+
+from subset_vars import subset_vars
+from set_var_index import set_var_index
+from compress_h5mu import write_h5ad_to_h5mu_with_compression
+from setup_logger import setup_logger
+
+logger = setup_logger()
+
+
+def main():
+    logger.info("Reading input data...")
+    adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"])
+
+    if par["var_input"]:
+        # Subset to HVG
+        adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy()
+    else:
+        adata_subset = adata.copy()
+
+    # Sanitize gene names and set as index of the AnnData object
+    adata_subset = set_var_index(
+        adata_subset, par["var_gene_names"], par["sanitize_ensembl_ids"]
+    )
+
+    logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}")
+    scvi_model = scvi.model.SCVI.load(
+        par["scvi_model"],
+        adata_subset,
+        accelerator="auto",
+        device="auto",
+    )
+
+    logger.info("Instantiating scANVI model from scVI model...")
+    vae_uns = scvi.model.SCANVI.from_scvi_model(
+        scvi_model,
+        unlabeled_category=par["unlabeled_category"],
+        labels_key=par["obs_labels"],
+        adata=adata_subset,
+    )
+
+    plan_kwargs = {
+        "reduce_lr_on_plateau": par["reduce_lr_on_plateau"],
+        "lr_patience": par["lr_patience"],
+        "lr_factor": par["lr_factor"],
+    }
+
+    logger.info("Training scANVI model...")
+    # Train the model
+    vae_uns.train(
+        max_epochs=par["max_epochs"],
+        early_stopping=par["early_stopping"],
+        early_stopping_monitor=par["early_stopping_monitor"],
+        early_stopping_patience=par["early_stopping_patience"],
+        early_stopping_min_delta=par["early_stopping_min_delta"],
+        plan_kwargs=plan_kwargs,
+        check_val_every_n_epoch=1,
+        accelerator="auto",
+    )
+    # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set
+
+    logger.info("Performing scANVI integration...")
+    # Get the latent output
+    adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation()
+
+    logger.info("Performing scANVI prediction...")
+    adata.obs[par["obs_output_predictions"]] = vae_uns.predict()
+    adata.obs[par["obs_output_probabilities"]] = np.max(
+        vae_uns.predict(soft=True), axis=1
+    )
+
+    logger.info("Writing output data...")
+    write_h5ad_to_h5mu_with_compression(
+        par["output"], par["input"], par["modality"], adata, par["output_compression"]
+    )
+    if par["output_model"]:
+        vae_uns.save(par["output_model"], overwrite=True)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/scanvi/test.py
+++ b/src/annotate/scanvi/test.py
@@ -0,0 +1,127 @@
+import pytest
+import re
+import subprocess
+from pathlib import Path
+
+import mudata
+from anndata.tests.helpers import assert_equal
+
+## VIASH START
+meta = {
+    "executable": "./target/executable/integrate/scanvi/scanvi",
+    "resources_dir": "./resources_test/pbmc_1k_protein_v3/",
+}
+## VIASH END
+
+import sys
+
+sys.path.append(meta["resources_dir"])
+
+input_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
+input_file_2 = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu"
+scvi_model = f"{meta['resources_dir']}/scvi_model"
+
+
+def test_scanvi(run_component):
+    args = [
+        "--input",
+        input_file,
+        "--modality",
+        "rna",
+        "--obs_labels",
+        "cell_ontology_class",
+        "--scvi_model",
+        scvi_model,
+        "--output",
+        "output.h5mu",
+        "--output_model",
+        "scanvi_model",
+        "--max_epochs",
+        "5",
+        "--output_compression",
+        "gzip",
+    ]
+
+    run_component(args)
+
+    input_rna = mudata.read_h5ad(input_file.strip(), mod="rna")
+
+    # check files
+    assert Path("output.h5mu").is_file(), "Output file does not exist"
+    assert Path("scanvi_model").is_dir()
+    assert Path("scanvi_model/model.pt").is_file()
+
+    # check output h5mu
+    output_data = mudata.read_h5mu("output.h5mu")
+    output_rna = output_data.mod["rna"]
+    assert output_rna.n_obs == input_rna.n_obs, (
+        f"Number of observations changed\noutput_data: {output_data}"
+    )
+    assert output_rna.n_vars == input_rna.n_vars, (
+        f"Number of variables changed\noutput_data: {output_data}"
+    )
+
+    assert "X_scanvi_integrated" in output_rna.obsm, (
+        f".obsm['X_scanvi_integrated'] not added\noutput_data: {output_data}"
+    )
+
+    assert "scanvi_pred" in output_rna.obs, (
+        f".obs['scanvi_pred'] not added\noutput_data: {output_data}"
+    )
+
+    assert "scanvi_proba" in output_rna.obs, (
+        f".obs['scanvi_proba'] not added\noutput_data: {output_data}"
+    )
+
+    predictions = output_data.mod["rna"].obs["scanvi_pred"]
+    probabilities = output_data.mod["rna"].obs["scanvi_proba"]
+
+    assert predictions.dtype == "category", (
+        "Calculated predictions should be category dtype"
+    )
+    assert not all(predictions.isna()), "Not all predictions should be NA"
+    assert probabilities.dtype == "float32", (
+        "Calculated probabilities should be float32 dtype"
+    )
+    assert all(0 <= value <= 1 for value in probabilities), (
+        ".obs at celltypist_probability has values outside the range [0, 1]"
+    )
+
+    # assert that nothing else has changed
+    del output_rna.obsm["X_scanvi_integrated"]
+    del output_rna.obs["scanvi_pred"]
+    del output_rna.obs["scanvi_proba"]
+
+    assert_equal(input_rna, output_rna)
+
+
+def test_raises_with_noncompatible_input_file(run_component):
+    args = [
+        "--input",
+        input_file_2,
+        "--modality",
+        "rna",
+        "--obs_labels",
+        "cell_ontology_class",
+        "--scvi_model",
+        scvi_model,
+        "--output",
+        "output.h5mu",
+        "--output_model",
+        "scanvi_model",
+        "--max_epochs",
+        "5",
+        "--output_compression",
+        "gzip",
+    ]
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(args)
+    assert re.search(
+        r"ValueError: Number of vars in `adata_target` not the same as source.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/singler/config.vsh.yaml
+++ b/src/annotate/singler/config.vsh.yaml
@@ -0,0 +1,199 @@
+name: singler
+namespace: annotate
+scope: "public"
+description: |
+  SingleR performs reference-based cell type annotation for single-cell RNA-seq data 
+  by computing Spearman correlations between test cells and reference samples with known labels, 
+  using marker genes to assign the most similar cell type label to each new cell.
+
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        default: "log_normalized"
+        description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. 
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata .var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_obs_clusters"
+        type: string
+        required: false
+        description: |
+          The name of the adata .obs column containing cluster identities of the observations. 
+          If provided, annoation is performed on the aggregated cluster profiles, 
+          otherwise it defaults to annotation per observation.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+  
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: true
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data containing lognormalized couns to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        required: true
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+      - name: "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--reference_var_input"
+        type: string
+        default: "filter_with_hvg"
+        required: false
+        description: |
+          .var column containing a boolean mask corresponding to genes to be used for marker selection. If not provided, genes will not be subset.
+
+  - name: Arguments
+    description: Arguments related to the training of and classification with the SingleR model
+    arguments:
+      - name: "--de_n_genes"
+        type: integer
+        required: False
+        min: 1
+        description: |
+          The number of differentially expressed genes across labels to be calculated from the reference.
+          Defaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels when if `--de_method` is set to `classic`,
+          otherwise, defaults to 10.
+      - name: "--de_method"
+        type: string
+        default: "classic"
+        choices: ["classic", "t", "wilcox"]
+        description: Method to detect differentially expressed genes between pairs of labels. 
+      - name: "--quantile"
+        type: double
+        min: 0
+        max: 1
+        default: 0.8
+        description: The quantile of the correlation distribution to use to compute the score per label.
+      - name: "--fine_tune"
+        type: boolean
+        default: true
+        description: |
+          Whether finetuning should be performed to improve the resolution. 
+          If set to True, an additional finetuning step is performed after initial classification, 
+          new marker genes are calculated based on all cells with a score higher then the maximum score minus `--fine_tuning_thershold`,
+          and the calculation of the scores is repeated.
+      - name: "--fine_tuning_threshold"
+        type: double
+        default: 0.05
+        description: |
+          The maximum difference from the maximum correlation to use in fine-tuning
+      - name: "--prune"
+        type: boolean
+        default: true
+        description: 
+          Whether label pruning should be performed. 
+          If set to True, an additional output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,
+          containing labels where 'low-quality' labels are replaced with NA's.
+          Labels are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`) fall more than 3 median absolute deviations below the median for that label type.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: singler_pred
+        description: |
+          In which `.obs` slot to store the predicted labels. If `--fine_tune False`, this is based only on the maximum entry in `--output_obsm_scores`.
+      - name: "--output_obs_probability"
+        type: string
+        default: singler_probability
+        description: |
+          In which `.obs` slots to store the probability of the predicted labels.
+      - name: "--output_obs_delta_next"
+        type: string
+        default: singler_delta_next
+        description: |
+          In which `.obs` slot to store the delta between the best and next-best score. If `--fine_tune True`, this is reported for scores after fine-tuning.
+      - name: "--output_obs_pruned_predictions"
+        type: string
+        default: singler_pruned_labels
+        description: |
+          In which `.obs` slot to store the pruned labels, where low-quality labels are replaced with NA's. Only added if `--prune True`.
+      - name: "--output_obsm_scores"
+        type: string
+        default: singler_scores
+        description: |
+          In which `.obsm` slot to store the matrix of prediction correlations at the specified quantile for each label (column) in each cell (row).
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+resources:
+  - type: r_script
+    path: script.R
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
+
+engines:
+  - type: docker
+    image: rocker/r2u:24.04
+    setup:
+      - type: docker
+        env: 
+          - RETICULATE_PYTHON=/usr/bin/python
+          - PIP_BREAK_SYSTEM_PACKAGES=1
+      - type: apt
+        packages: [ libhdf5-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: r
+        cran: [ anndata, reticulate, SingleR ]
+        bioc: [ scrapper, bit64 ]
+      - type: python
+        user: true
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml ]
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowmem, lowcpu]
--- a/src/annotate/singler/script.R
+++ b/src/annotate/singler/script.R
@@ -0,0 +1,202 @@
+library(SingleR)
+library(Matrix)
+requireNamespace("anndata", quietly = TRUE)
+requireNamespace("reticulate", quietly = TRUE)
+mudata <- reticulate::import("mudata")
+
+### VIASH START
+par <- list(
+  input = "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+  modality = "rna",
+  input_layer = NULL,
+  input_var_gene_names = "gene_symbol",
+  input_reference_gene_overlap = 100,
+  input_obs_clusters = NULL,
+  reference = "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+  reference_layer = NULL,
+  reference_var_input = NULL,
+  reference_var_gene_names = NULL,
+  reference_obs_target = "cell_ontology_class",
+  output = "singler_output.h5mu",
+  output_compression = "gzip",
+  output_obs_predictions = "singler_labels",
+  output_obs_probability = "singler_proba",
+  output_obsm_scores = "single_r_scores",
+  output_obs_delta_next = "singler_delta_next",
+  output_obs_pruned_predictions = "singler_pruned_labels",
+  quantile = 0.8,
+  fine_tune = TRUE,
+  fine_tuning_threshold = 0.05,
+  prune = TRUE,
+  de_n_genes = NULL,
+  de_method = "classic"
+)
+meta <- list(
+  cpus = 4
+)
+
+### VIASH END
+
+get_layer <- function(adata, layer, var_gene_names) {
+  # find data
+  data <-
+    if (is.null(layer)) {
+      adata$X
+    } else {
+      adata$layers[[layer]]
+    }
+
+  # check if data is available
+  if (is.null(data)) {
+    if (is.null(layer)) {
+      stop("No layer specified and no .X slot available in the AnnData object.")
+    } else {
+      stop(
+        "Requested layer '",
+        layer,
+        "' is not available in the AnnData object. Available layers: ",
+        paste(names(adata$layers), collapse = ", ")
+      )
+    }
+  }
+
+  # Set matrix dimnames
+  input_gene_names <- sanitize_ensembl_ids(adata, var_gene_names)
+  dimnames(data) <- list(adata$obs_names, input_gene_names)
+
+  # return output
+  data
+}
+
+sanitize_ensembl_ids <- function(adata, gene_symbol = NULL) {
+  if (is.null(gene_symbol)) {
+    gene_names <- adata$var_names
+  } else {
+    gene_names <- adata$var[[gene_symbol]]
+  }
+
+  # Pattern matches Ensembl IDs: starts with ENS, followed by any characters,
+  # then an eleven digit number, optionally followed by .version_number
+  ensembl_pattern <- "^(ENS.*\\d{11})(?:\\.\\d+)?$"
+
+  # Remove version numbers for ensembl ids only
+  sanitized <- ifelse(
+    grepl(ensembl_pattern, gene_names, perl = TRUE),
+    gsub(ensembl_pattern, "\\1", gene_names, perl = TRUE),
+    as.character(gene_names)
+  )
+
+  sanitized
+}
+
+subset_vars <- function(adata, subset_col) {
+  # Check if column exists
+  if (!subset_col %in% colnames(adata$var)) {
+    stop(
+      "Requested to use .var column '",
+      subset_col,
+      "' as a selection of genes, but the column is not available."
+    )
+  }
+
+  # Get the column
+  subset_values <- adata$var[[subset_col]]
+
+  # Check for NA values
+  if (any(is.na(subset_values))) {
+    stop(
+      "The .var column `",
+      subset_col,
+      "` contains NA values. Can not subset data."
+    )
+  }
+
+  # Check if it's logical/boolean
+  if (!is.logical(subset_values)) {
+    stop(
+      "Expected dtype of .var column '",
+      subset_col,
+      "' to be `logical`, but found ",
+      class(subset_values)[1],
+      ". Can not subset data."
+    )
+  }
+
+  # Subset and return copy
+  adata_subset <- adata[, subset_values]$copy()
+  adata_subset
+}
+
+# Read input data
+cat("Reading input file\n")
+input_mdata <- mudata$read_h5mu(par$input)
+input_adata <- input_mdata$mod[[par$modality]]
+input_matrix <- Matrix::t(
+  get_layer(input_adata, par$input_layer, par$input_var_gene_names)
+)
+
+# Read reference
+cat("Reading reference file\n")
+ref_adata <- mudata$read_h5ad(par$reference, mod = "rna")
+if (!is.null(par$reference_var_input)) {
+  ref_adata <- subset_vars(ref_adata, par$reference_var_input)
+}
+ref_matrix <- Matrix::t(
+  get_layer(ref_adata, par$reference_layer, par$reference_var_gene_names)
+)
+
+# Check overlap genes
+cat("Checking overlap of genes between input and reference file\n")
+common_ens_ids <- intersect(rownames(ref_matrix), rownames(input_matrix))
+if (length(common_ens_ids) < par$input_reference_gene_overlap) {
+  stop(
+    "The intersection of genes between the query and reference is too small.\n",
+    paste0("Expected overlap: ", par$input_reference_gene_overlap),
+    paste0("Detected overlap: ", length(common_ens_ids))
+  )
+}
+
+# Calculate CPU cores
+n_workers <- meta$cpus %||% max(1, parallel::detectCores() - 1)
+
+# Assign target labels
+if (!par$reference_obs_target %in% colnames(ref_adata$obs)) {
+  stop(
+    "Requested to use .obs column '",
+    par$reference_obs_target,
+    "' as target labels, but the column is not available."
+  )
+} else {
+  target_labels <- ref_adata$obs[[par$reference_obs_target]]
+}
+
+cat("Performing SingleR cell type prediction\n")
+predictions <- SingleR(
+  test = input_matrix,
+  ref = ref_matrix,
+  labels = target_labels,
+  clusters = par$input_obs_clusters,
+  genes = "de",
+  de.method = par$de_method,
+  de.n = par$de_n_genes,
+  quantile = par$quantile,
+  fine.tune = par$fine_tune,
+  tune.thresh = par$fine_tuning_threshold,
+  prune = par$prune,
+  num.threads = n_workers
+)
+
+cat("Writing output data\n")
+# Writing output slots
+input_adata$obs[[par$output_obs_predictions]] <-
+  predictions$labels
+input_adata$obs[[par$output_obs_probability]] <-
+  apply(predictions$scores, 1, max)
+input_adata$obs[[par$output_obs_delta_next]] <-
+  predictions$delta.next
+input_adata$obs[[par$output_obs_pruned_predictions]] <-
+  predictions$pruned.labels
+input_adata$obsm[[par$output_obsm_scores]] <-
+  predictions$scores
+# Writing output H5MU
+input_mdata$write(par$output, compression = par$output_compression)
--- a/src/annotate/singler/test.py
+++ b/src/annotate/singler/test.py
@@ -0,0 +1,143 @@
+import sys
+import os
+import scanpy as sc
+import pytest
+import mudata as mu
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+
+## VIASH START
+meta = {"resources_dir": "resources_test"}
+## VIASH END
+
+input_file = (
+    f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+)
+reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
+
+
+def log_normalize(adata):
+    adata_lognorm = adata.copy()
+    sc.pp.normalize_total(adata_lognorm, target_sum=1e4)
+    sc.pp.log1p(adata_lognorm)
+    adata.layers["log_normalized"] = adata_lognorm.X
+    return adata
+
+
+@pytest.fixture
+def input_path(write_mudata_to_file):
+    mdata = mu.read_h5mu(input_file)
+    adata = mdata.mod["rna"].copy()
+    adata_lognorm = log_normalize(adata)
+    mdata.mod["rna"] = adata_lognorm
+    return write_mudata_to_file(mdata)
+
+
+def test_simple_execution(run_component, random_h5mu_path, input_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--input_var_gene_names",
+            "gene_symbol",
+            "--reference",
+            reference_file,
+            "--reference_var_input",
+            "highly_variable",
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "singler_pred",
+        "singler_probability",
+        "singler_delta_next",
+        "singler_pruned_labels",
+    ]
+    assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"]
+
+    predictions = output_mudata.mod["rna"].obs["singler_pred"]
+    assert predictions.dtype == "category", (
+        "Calculated predictions should be category dtype"
+    )
+    assert predictions.notna().all(), "None of the predictions should be NA"
+
+    pruned_predictions = output_mudata.mod["rna"].obs["singler_pruned_labels"]
+    assert pruned_predictions.dtype == "category", (
+        "Pruned predictions should be category dtype"
+    )
+    assert pruned_predictions.isna().any(), (
+        "Some of the pruned predictions should be NA"
+    )
+    assert not all(pruned_predictions.isna()), "Not all pruned predictions should be NA"
+
+    delta_next = output_mudata.mod["rna"].obs["singler_delta_next"]
+    assert delta_next.dtype == "float", "Calculated delta next should be float dtype"
+
+    scores = output_mudata.mod["rna"].obsm["singler_scores"]
+    assert scores.dtype == "float", "Calculated scores should be float dtype"
+    assert all(-1 <= value <= 1 for value in scores.flatten()), (
+        ".obsm at singler_scores has values outside the range [-1, 1]"
+    )
+
+
+def test_params(run_component, random_h5mu_path, input_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--input_reference_gene_overlap",
+            "1000",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--reference_var_input",
+            "highly_variable",
+            "de_n_genes",
+            "600",
+            "--de_method",
+            "wilcox",
+            "--quantile",
+            "0.75",
+            "--fine_tuning_threshold",
+            "0.1",
+            "--prune",
+            "False",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "singler_pred",
+        "singler_probability",
+        "singler_delta_next",
+    ]
+    assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"]
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/svm_annotation/config.vsh.yaml
+++ b/src/annotate/svm_annotation/config.vsh.yaml
@@ -0,0 +1,159 @@
+name: svm_annotation
+namespace: annotate
+scope: "public"
+description: Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description:
+        required: true
+        description: |
+          Key in .obs attribute of reference modality with cell-type information.
+
+      - name:  "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_prediction"
+        type: string
+        default: svm_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: svm_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: true
+      - name: "--max_iter"
+        type: integer
+        description: "Maximum number of iterations for the SVM."
+        min: 1
+        default: 5000
+      - name: "--c_reg"
+        type: double
+        description: "Regularization parameter for the SVM."
+        min: 0.0
+        default: 1.0
+      - name: "--class_weight"
+        type: string
+        description: |
+          "Class weights for the SVM.  The `uniform` mode gives all classes a weight  of one. 
+          The `balanced` mode (default) uses the values of y to automatically adjust weights inversely 
+          proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))"
+        choices: ["balanced", "uniform"]
+        default: "balanced"
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/cross_check_genes.py
+  - path: /src/utils/subset_vars.py
+  - path: /src/utils/set_var_index.py
+  
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/
+  - path: /resources_test/pbmc_1k_protein_v3/
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - libhdf5-dev
+          - procps
+      - type: python
+        packages:
+          - scikit-learn==1.5.2
+      - type: python
+        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable 
+  - type: nextflow
+    directives:
+      label: [highcpu, highmem, highdisk]
--- a/src/annotate/svm_annotation/script.py
+++ b/src/annotate/svm_annotation/script.py
@@ -0,0 +1,144 @@
+import sys
+import mudata as mu
+import numpy as np
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn import svm
+import pickle
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "output": "output.h5mu",
+    "modality": "rna",
+    "reference": None,
+    # "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    # "model": None,
+    "model": "resources_test/annotation_test_data/svm_model.pkl",
+    "reference_obs_target": "cell_ontology_class",
+    "reference_var_input": None,
+    "input_layer": None,
+    "reference_layer": None,
+    "max_iter": 5000,
+    "c_reg": 1,
+    "class_weight": "balanced",
+    "output_compression": "gzip",
+    "input_var_gene_names": None,
+    "reference_var_gene_names": "ensemblid",
+    "output_obs_prediction": "svm_pred",
+    "output_obs_probability": "svm_probability",
+    "input_reference_gene_overlap": 100,
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from cross_check_genes import cross_check_genes
+from subset_vars import subset_vars
+from set_var_index import set_var_index
+
+logger = setup_logger()
+
+
+def main():
+    if (not par["model"] and not par["reference"]) or (
+        par["model"] and par["reference"]
+    ):
+        raise ValueError(
+            "Make sure to provide either 'model' or 'reference', but not both."
+        )
+    logger.info("Reading input data")
+    input_mudata = mu.read_h5mu(par["input"])
+    input_adata = input_mudata.mod[par["modality"]]
+    input_modality = input_adata.copy()
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
+
+    if par["model"]:
+        logger.info("Loading a pre-trained model")
+        model = pickle.load(open(par["model"], "rb"))
+        if hasattr(model, "_feature_names_in"):
+            common_genes = cross_check_genes(
+                input_modality.var.index,
+                model._feature_names_in,
+                par["input_reference_gene_overlap"],
+            )
+            if not len(common_genes) == len(model._feature_names_in):
+                raise ValueError("Input dataset does not contain all model features.")
+            input_modality = input_modality[:, common_genes]
+            input_matrix = (
+                input_modality.layers[par["input_layer"]]
+                if par["input_layer"]
+                else input_modality.X
+            )
+
+        else:
+            logger.warning(
+                "Model does not have feature names saved. Could not check overlap of model's features with query genes."
+            )
+
+    elif par["reference"]:
+        logger.info("Reading reference data")
+
+        reference_mudata = mu.read_h5mu(par["reference"])
+        reference_modality = reference_mudata.mod[par["modality"]].copy()
+        reference_modality = set_var_index(
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
+        )
+
+        # subset to HVG if required
+        if par["reference_var_input"]:
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
+
+        # Query and input require the exact same features
+        common_genes = cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            par["input_reference_gene_overlap"],
+        )
+        reference_modality = reference_modality[:, common_genes]
+        input_modality = input_modality[:, common_genes]
+
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
+        input_matrix = (
+            input_modality.layers[par["input_layer"]]
+            if par["input_layer"]
+            else input_modality.X
+        )
+
+        logger.info("Training a model...")
+        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
+        model = CalibratedClassifierCV(
+            svm.LinearSVC(
+                C=par["c_reg"],
+                max_iter=par["max_iter"],
+                class_weight=par["class_weight"]
+                if not par["class_weight"] == "uniform"
+                else None,
+                dual="auto",
+            )
+        )
+        model.fit(reference_matrix, labels)
+        model._feature_names_in = reference_modality.var.index
+
+    logger.info("Running predictions...")
+    predictions = model.predict(input_matrix)
+    probabilities = np.max(model.predict_proba(input_matrix), axis=1)
+
+    logger.info("Writing output data")
+    input_adata.obs[par["output_obs_prediction"]] = predictions
+    input_adata.obs[par["output_obs_probability"]] = probabilities
+    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main()
--- a/src/annotate/svm_annotation/test.py
+++ b/src/annotate/svm_annotation/test.py
@@ -0,0 +1,184 @@
+import sys
+import os
+import pytest
+import subprocess
+import re
+import mudata as mu
+from openpipeline_testutils.asserters import assert_annotation_objects_equal
+from sklearn import svm
+from sklearn.calibration import CalibratedClassifierCV
+import pickle
+
+## VIASH START
+meta = {"resources_dir": "resources_test"}
+sys.path.append("src/utils")
+## VIASH END
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
+
+sys.path.append(meta["resources_dir"])
+from cross_check_genes import cross_check_genes
+from set_var_index import set_var_index
+
+
+@pytest.fixture
+def dummy_model(tmp_path):
+    reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy()
+    reference_modality = set_var_index(reference_modality, "ensemblid")
+
+    input_modality = mu.read_h5mu(input_file).mod["rna"].copy()
+    input_modality = set_var_index(input_modality, None)
+
+    common_genes = cross_check_genes(
+        input_modality.var.index, reference_modality.var.index
+    )
+    reference_modality = reference_modality[:, common_genes]
+
+    labels = reference_modality.obs["cell_ontology_class"].to_numpy()
+    model = CalibratedClassifierCV(
+        svm.LinearSVC(
+            max_iter=10,
+            dual="auto",
+        )
+    )
+    model.fit(reference_modality.X, labels)
+    model._feature_names_in = reference_modality.var.index
+
+    model_path = tmp_path / "model.pkl"
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+
+    return model_path
+
+
+def test_simple_execution(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"]
+
+    obs_values = output_mudata.mod["rna"].obs["svm_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_custom_out_obs_model_params(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference",
+            reference_file,
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--output_obs_prediction",
+            "dummy_pred",
+            "--output_obs_probability",
+            "dummy_probability",
+            "--max_iter",
+            "1000",
+            "--c_reg",
+            "0.1",
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == [
+        "dummy_pred",
+        "dummy_probability",
+    ]
+
+    obs_values = output_mudata.mod["rna"].obs["dummy_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_with_model(run_component, random_h5mu_path, dummy_model):
+    output_file = random_h5mu_path()
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--model",
+            dummy_model,
+            "--output",
+            output_file,
+        ]
+    )
+
+    assert os.path.exists(output_file), "Output file does not exist"
+
+    input_mudata = mu.read_h5mu(input_file)
+    output_mudata = mu.read_h5mu(output_file)
+
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
+
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"]
+
+    obs_values = output_mudata.mod["rna"].obs["svm_probability"]
+    assert all(0 <= value <= 1 for value in obs_values), (
+        "probabilities outside the range [0, 1]"
+    )
+
+
+def test_no_model_no_reference_error(run_component, random_h5mu_path):
+    output_file = random_h5mu_path()
+
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--reference_obs_target",
+                "cell_ontology_class",
+                "--output",
+                output_file,
+            ]
+        )
+    assert re.search(
+        r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
--- a/src/authors/angela_pisco.yaml
+++ b/src/authors/angela_pisco.yaml
@@ -0,0 +1,14 @@
+name: Angela Oliveira Pisco
+info:
+  role: Contributor
+  links:
+    github: aopisco
+    orcid: "0000-0003-0142-2355"
+    linkedin: aopisco
+  organizations:
+    - name: Insitro
+      href: https://insitro.com
+      role: Director of Computational Biology
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/dorien_roosen.yaml
+++ b/src/authors/dorien_roosen.yaml
@@ -0,0 +1,11 @@
+name: Dorien Roosen
+info:
+  role: Core Team Member
+  links:
+    email: dorien@data-intuitive.com
+    github: dorien-er
+    linkedin: dorien-roosen
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
--- a/src/authors/dries_de_maeyer.yaml
+++ b/src/authors/dries_de_maeyer.yaml
@@ -0,0 +1,11 @@
+name: Dries De Maeyer
+info:
+  role: Core Team Member
+  links:
+    email: ddemaeyer@gmail.com
+    github: ddemaeyer
+    linkedin: dries-de-maeyer-b46a814
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Principal Scientist
--- a/src/authors/dries_schaumont.yaml
+++ b/src/authors/dries_schaumont.yaml
@@ -0,0 +1,12 @@
+name: Dries Schaumont
+info:
+  role: Core Team Member
+  links:
+    email: dries@data-intuitive.com
+    github: DriesSchaumont
+    orcid: "0000-0002-4389-0440"
+    linkedin: dries-schaumont
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
--- a/src/authors/elizabeth_mlynarski.yaml
+++ b/src/authors/elizabeth_mlynarski.yaml
@@ -0,0 +1,6 @@
+name: Elizabeth Mlynarski
+info:
+  role: Contributor
+  organizations:
+    - name: Janssen R&D US
+      role: Principal Scientist Computational Genomics
--- a/src/authors/isabelle_bergiers.yaml
+++ b/src/authors/isabelle_bergiers.yaml
@@ -0,0 +1,10 @@
+name: Isabelle Bergiers 
+info:
+  role: Contributor
+  links:
+    github: Isabelle-b
+    orcid: 0000-0001-9622-7960
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Scientist OMICS Technology
--- a/src/authors/jakub_majercik.yaml
+++ b/src/authors/jakub_majercik.yaml
@@ -0,0 +1,11 @@
+name: Jakub Majercik
+info:
+  role: Contributor
+  links:
+    email: jakub@data-intuitive.com
+    github: jakubmajercik
+    linkedin: jakubmajercik
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Bioinformatics Engineer
--- a/src/authors/kai_waldrant.yaml
+++ b/src/authors/kai_waldrant.yaml
@@ -0,0 +1,15 @@
+name: Kai Waldrant
+info:
+  role: Contributor
+  links:
+    email: kai@data-intuitive.com
+    github: KaiWaldrant
+    orcid: "0009-0003-8555-1361"
+    linkedin: kaiwaldrant
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Bioinformatician
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Contributor
--- a/src/authors/luke_zappia.yaml
+++ b/src/authors/luke_zappia.yaml
@@ -0,0 +1,12 @@
+name: Luke Zappia
+info:
+  role: Contributor
+  links:
+    email: luke@data-intuitive.com
+    github: lazappi
+    orcid: 0000-0001-7744-8565
+    linkedin: lazappi
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Science Engineer
--- a/src/authors/malte_luecken.yaml
+++ b/src/authors/malte_luecken.yaml
@@ -0,0 +1,16 @@
+name: Malte D. Luecken
+info:
+  role: Core Team Member
+  links:
+    email: malte.luecken@helmholtz-muenchen.de
+    github: LuckyMD
+    orcid: "0000-0001-7464-7921"
+    linkedin: malte-l%C3%BCcken-b8b21049
+    twitter: MDLuecken
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Group Leader
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/marijke_van_moerbeke.yaml
+++ b/src/authors/marijke_van_moerbeke.yaml
@@ -0,0 +1,11 @@
+name: Marijke Van Moerbeke
+info:
+  role: Contributor
+  links:
+    github: mvanmoerbeke
+    orcid: 0000-0002-3097-5621
+    linkedin: marijke-van-moerbeke-84303a34
+  organizations:
+    - name: OpenAnalytics
+      href: https://www.openanalytics.eu
+      role: Statistical Consultant
--- a/src/authors/matthias_beyens.yaml
+++ b/src/authors/matthias_beyens.yaml
@@ -0,0 +1,12 @@
+name: Matthias Beyens
+info:
+  role: Contributor
+  links:
+    github: MatthiasBeyens
+    orcid: "0000-0003-3304-0706"
+    email: matthias.beyens@gmail.com
+    linkedin: mbeyens
+  organizations:
+    - name: Janssen Pharmaceuticals
+      href: https://www.janssen.com
+      role: Principal Scientist
--- a/src/authors/mauro_saporita.yaml
+++ b/src/authors/mauro_saporita.yaml
@@ -0,0 +1,11 @@
+name: Mauro Saporita
+info:
+  role: Contributor
+  links:
+    email: maurosaporita@gmail.com
+    github: mauro-saporita
+    linkedin: mauro-saporita-930b06a5
+  organizations:
+    - name: Ardigen
+      href: https://ardigen.com
+      role: Lead Nextflow Developer
--- a/src/authors/povilas_gibas.yaml
+++ b/src/authors/povilas_gibas.yaml
@@ -0,0 +1,11 @@
+name: Povilas Gibas
+info:
+  role: Contributor
+  links:
+    email: povilasgibas@gmail.com
+    github: PoGibas
+    linkedin: povilas-gibas
+  organizations:
+    - name: Ardigen
+      href: https://ardigen.com
+      role: Bioinformatician
--- a/src/authors/robrecht_cannoodt.yaml
+++ b/src/authors/robrecht_cannoodt.yaml
@@ -0,0 +1,15 @@
+name: Robrecht Cannoodt
+info:
+  role: Core Team Member
+  links:
+    email: robrecht@data-intuitive.com
+    github: rcannood
+    orcid: "0000-0003-3641-729X"
+    linkedin: robrechtcannoodt
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Science Engineer
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
--- a/src/authors/samuel_d_souza.yaml
+++ b/src/authors/samuel_d_souza.yaml
@@ -0,0 +1,10 @@
+name: Samuel D'Souza
+info:
+  role: Contributor
+  links:
+    github: srdsam
+    linkedin: samuel-d-souza-887023150/
+  organizations:
+    - name: Chan Zuckerberg Biohub
+      href: https://www.czbiohub.org
+      role: Data Engineer
--- a/src/authors/sarah_ouologuem.yaml
+++ b/src/authors/sarah_ouologuem.yaml
@@ -0,0 +1,10 @@
+name: Sarah Ouologuem
+info:
+  role: Contributor
+  links:
+    github: SarahOuologuem
+    orcid: 0009-0005-3398-1700
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Student Assistant
--- a/src/authors/toni_verbeiren.yaml
+++ b/src/authors/toni_verbeiren.yaml
@@ -0,0 +1,10 @@
+name: Toni Verbeiren
+info:
+  role: Core Team Member
+  links:
+    github: tverbeiren
+    linkedin: verbeiren
+  organizations:
+  - name: Data Intuitive
+    href: https://www.data-intuitive.com
+    role: Data Scientist and CEO
--- a/src/authors/vladimir_shitov.yaml
+++ b/src/authors/vladimir_shitov.yaml
@@ -0,0 +1,12 @@
+name: Vladimir Shitov
+info:
+  role: Contributor
+  links:
+    email: vladimir.shitov@helmholtz-muenchen.de
+    github: vladimirshitov
+    orcid: "0000-0002-1960-8812"
+    linkedin: vladimir-shitov-9a659513b
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: PhD Candidate
--- a/src/authors/weiwei_schultz.yaml
+++ b/src/authors/weiwei_schultz.yaml
@@ -0,0 +1,6 @@
+name: Weiwei Schultz
+info:
+  role: Contributor
+  organizations:
+    - name: Janssen R&D US
+      role: Associate Director Data Sciences
--- a/src/authors/xichen_wu.yaml
+++ b/src/authors/xichen_wu.yaml
@@ -0,0 +1,11 @@
+name: Xichen Wu
+info:
+  role: Contributor
+  links:
+    github: wxicu
+    linkedin: xichen-wu
+    orcid: 0009-0008-2168-4508
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Student Assistant
--- a/src/base/h5_compression_argument.yaml
+++ b/src/base/h5_compression_argument.yaml
@@ -0,0 +1,9 @@
+arguments:
+  - name: "--output_compression"
+    description: |
+      Compression format to use for the output AnnData and/or Mudata H5 files.
+      By default no compression is applied.
+    type: string
+    choices: ["gzip", "lzf"]
+    required: false
+    example: "gzip"
--- a/src/base/requirements/anndata.yaml
+++ b/src/base/requirements/anndata.yaml
@@ -0,0 +1,4 @@
+packages:
+  - anndata~=0.12.16
+  - awkward #Required for reading VDJ data stored in AIRR format
+  - scipy~=1.17.1 # Exclude scipy 1.17.0 because https://github.com/scverse/anndata/issues/339
--- a/src/base/requirements/anndata_mudata.yaml
+++ b/src/base/requirements/anndata_mudata.yaml
@@ -0,0 +1,5 @@
+__merge__: [/src/base/requirements/anndata.yaml, .]
+packages:
+  - mudata~=0.3.8
+script: |
+    exec("try:\n  import zarr; from importlib.metadata import version\nexcept ModuleNotFoundError:\n  exit(0)\nelse:  assert int(version(\"zarr\").partition(\".\")[0]) > 2")
--- a/src/base/requirements/openpipeline_testutils.yaml
+++ b/src/base/requirements/openpipeline_testutils.yaml
@@ -0,0 +1,2 @@
+github:
+  - openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils
--- a/src/base/requirements/python_test_setup.yaml
+++ b/src/base/requirements/python_test_setup.yaml
@@ -0,0 +1,8 @@
+test_setup:
+  - type: apt
+    packages:
+      - git
+  - type: python
+    __merge__:
+      - /src/base/requirements/viashpy.yaml
+      - /src/base/requirements/openpipeline_testutils.yaml
--- a/src/base/requirements/scanpy.yaml
+++ b/src/base/requirements/scanpy.yaml
@@ -0,0 +1,2 @@
+packages:
+  - scanpy~=1.11.4
--- a/src/base/requirements/testworkflows_setup.yaml
+++ b/src/base/requirements/testworkflows_setup.yaml
@@ -0,0 +1,10 @@
+setup:
+  - type: apt
+    packages:
+      - procps
+      - git
+  - type: python
+    __merge__:
+      - /src/base/requirements/anndata_mudata.yaml
+      - /src/base/requirements/openpipeline_testutils.yaml
+      - /src/base/requirements/viashpy.yaml
--- a/Show More
+++ b/Show More