Build branch openpipeline/processing-workflows-quantile-filtering with version processing-workflows-quantile-filtering to openpipeline on branch processing-workflows-quantile-filtering (38f5f83d)

Build pipeline: openpipelines-bio.openpipeline.processing-workflows-quantijd49m

Source commit: 38f5f83d11

Source message: fix tests
This commit is contained in:
CI
2026-06-09 18:32:44 +00:00
commit 5b8fa21d13
2535 changed files with 1322536 additions and 0 deletions

3
.git-blame-ignore-revs Normal file
View File

@@ -0,0 +1,3 @@
# Apply ruff as linter for python code
9e87b864f699c371b444b592a19e610a3c9d3286

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
src/mapping/bd_rhapsody*/*.cwl linguist-generated
src/query/cellxgene_census linguist-generated

41
.gitignore vendored Normal file
View File

@@ -0,0 +1,41 @@
# Jupyter notebooks
.ipynb_checkpoints
# pycache
*__pycache__*
.nfs*
# R related
.Rhistory
*.Rproj
.Rproj.user
# Python virtual environments
.venv
# temporary files related
temp
# NextFlow
work/
.nextflow.log
flowchart.*
.nextflow*
out/
# Macos
.DS_Store
# viash related
.viash_log*
log.txt
check_results/
out/
output*
output_log/
resources_test
/viash_tools/
# vscode
.vscode/launch.json
.vscode/settings.json

3
.lintr Normal file
View File

@@ -0,0 +1,3 @@
exclusions: list(
"README.qmd"
)

24
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,24 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.14.0
hooks:
- id: ruff-check
args: [ --fix ]
- id: ruff-format
- repo: local
hooks:
- id: run_styler
name: run_styler
language: r
description: style files with {styler}
entry: "Rscript -e 'styler::style_file(commandArgs(TRUE))'"
files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$'
additional_dependencies:
- styler
- knitr
- repo: https://github.com/lorenzwalthert/precommit
rev: v0.4.3.9015
hooks:
- id: lintr

626
.pylintrc Normal file
View File

@@ -0,0 +1,626 @@
[MAIN]
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Load and enable all available extensions. Use --list-extensions to see a list
# all available extensions.
#enable-all-extensions=
# In error mode, messages with a category besides ERROR or FATAL are
# suppressed, and no reports are done by default. Error mode is compatible with
# disabling specific errors.
#errors-only=
# Always return a 0 (non-error) status code, even if lint errors are found.
# This is primarily useful in continuous integration scripts.
#exit-zero=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=
# Return non-zero exit code if any of these messages/categories are detected,
# even if score is above --fail-under value. Syntax same as enable. Messages
# specified are enabled, while categories only check already-enabled messages.
fail-on=
# Specify a score threshold under which the program will exit with error.
fail-under=10
# Interpret the stdin as a python script, whose filename needs to be passed as
# the module_or_package argument.
#from-stdin=
# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS
# Add files or directories matching the regular expressions patterns to the
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths=
# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
# Emacs file locks
ignore-patterns=^\.#
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use, and will cap the count on Windows to
# avoid hangs.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.10
# Discover python modules and packages in the file system subtree.
recursive=no
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# In verbose mode, extra non-checker-related info will be displayed.
#verbose=
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style. If left empty, argument names will be checked with the set
# naming style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style. If left empty, attribute names will be checked with the set naming
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style. If left empty, class attribute names will be checked
# with the set naming style.
#class-attribute-rgx=
# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE
# Regular expression matching correct class constant names. Overrides class-
# const-naming-style. If left empty, class constant names will be checked with
# the set naming style.
#class-const-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style. If left empty, class names will be checked with the set naming style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style. If left empty, constant names will be checked with the set naming
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style. If left empty, function names will be checked with the set
# naming style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style. If left empty, inline iteration names will be checked
# with the set naming style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style. If left empty, method names will be checked with the set naming style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style. If left empty, module names will be checked with the set naming style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Regular expression matching correct type variable names. If left empty, type
# variable names will be checked with the set naming style.
#typevar-rgx=
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style. If left empty, variable names will be checked with the set
# naming style.
#variable-rgx=
[CLASSES]
# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# List of regular expressions of class ancestor names to ignore when counting
# public methods (see R0903)
exclude-too-few-public-methods=
# List of qualified class names to ignore when counting class parents (see
# R0901)
ignored-parents=
# Maximum number of arguments for function / method.
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[EXCEPTIONS]
# Exceptions that will emit a warning when caught.
overgeneral-exceptions=BaseException,
Exception
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=
# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=
# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=
# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
# UNDEFINED.
confidence=HIGH,
CONTROL_FLOW,
INFERENCE,
INFERENCE_FAILURE,
UNDEFINED
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
line-too-long,
missing-module-docstring,
redefined-outer-name
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[METHOD_ARGS]
# List of qualified names (i.e., library.method) which require a timeout
# parameter e.g. 'requests.api.get,requests.api.post'
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
notes-rgx=
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
# 'convention', and 'info' which contain the number of messages in each
# category, as well as 'statement' which is the total number of statements
# analyzed. This score is used by the global evaluation report (RP0004).
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
#output-format=
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[SIMILARITIES]
# Comments are removed from the similarity computation
ignore-comments=yes
# Docstrings are removed from the similarity computation
ignore-docstrings=yes
# Imports are removed from the similarity computation
ignore-imports=yes
# Signatures are removed from the similarity computation
ignore-signatures=yes
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: en_AG (hunspell), en_AU
# (hunspell), en_BS (hunspell), en_BW (hunspell), en_BZ (hunspell), en_CA
# (hunspell), en_DK (hunspell), en_GB (hunspell), en_GH (hunspell), en_HK
# (hunspell), en_IE (hunspell), en_IN (hunspell), en_JM (hunspell), en_MW
# (hunspell), en_NA (hunspell), en_NG (hunspell), en_NZ (hunspell), en_PH
# (hunspell), en_SG (hunspell), en_TT (hunspell), en_US (hunspell), en_ZA
# (hunspell), en_ZM (hunspell), en_ZW (hunspell).
spelling-dict=
# List of comma separated words that should be considered directives if they
# appear at the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of symbolic message names to ignore for Mixin members.
ignored-checks-for-mixins=no-member,
not-async-context-manager,
not-context-manager,
attribute-defined-outside-init
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of names allowed to shadow builtins
allowed-redefined-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io

1874
CHANGELOG.md Normal file

File diff suppressed because it is too large Load Diff

132
CODE_OF_CONDUCT.md Normal file
View File

@@ -0,0 +1,132 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
[INSERT CONTACT METHOD].
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 OpenPipelines
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

258
README.md Normal file
View File

@@ -0,0 +1,258 @@
# OpenPipeline
Extensible single cell analysis pipelines for reproducible and
large-scale single cell processing using Viash and Nextflow.
[![ViashHub](https://img.shields.io/badge/ViashHub-openpipeline-7a4baa.svg)](https://www.viash-hub.com/packages/openpipeline)
[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2Fopenpipeline-blue.svg)](https://github.com/openpipelines-bio/openpipeline)
[![GitHub
License](https://img.shields.io/github/license/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/blob/main/LICENSE)
[![GitHub
Issues](https://img.shields.io/github/issues/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/issues)
[![Viash
version](https://img.shields.io/badge/Viash-v0.9.3-blue.svg)](https://viash.io)
## Documentation
Please find more in-depth documentation on [the
website](https://openpipelines.bio/).
## Functionality Overview
Openpipelines execute a list of predefined tasks. These descrete steps
are also provided as standalone components that can be executed
individually, with a standardized interface. This is especially useful
when a particular step wraps a tool that you do not necessarily always
need to execute in a workflow context.
In terms of workflows, the following functionality is provided:
- Demultiplexing: conversion of raw sequencing data to FASTQ objects.
- [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion):
Read mapping and generating a count matrix.
- [Single sample
processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample):
cell filtering and doublet detection.
- [Multisample
processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing):
Count transformation, normalization, QC metric calulations.
- [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration):
Clustering, integration and batch correction using single and
multimodal methods.
- Downstream analysis workflows
``` mermaid lang="mermaid"
flowchart LR
demultiplexing["Step 1: Demultiplexing"]
ingestion["Step 2: Ingestion"]
process_samples["Step 3: Process Samples"]
integration["Step 4: Integration"]
downstream["Step 5: Downstream"]
demultiplexing-->ingestion-->process_samples-->integration-->downstream
```
## Guided execution using Viash Hub (CLI and Seqera cloud)
Openpipelines is now available on [Viash
Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub
provides a list of components and workflows, together with a graphical
interface that guides you through the steps of running a workflow or
standalone component. Intstructions are provided for using a local viash
or nextflow executable (requires using a linux based OS), but connecting
to a Seqera cloud instance is also supported.
## Execution using the nextflow executable
Executing a workflow is a bit more involved and requires familiarity
with the command line interface (CLI).
### Setup
In order to use the workflows in this package on your local computer,
youll need to do the following:
- Install [nextflow](https://www.nextflow.io/docs/latest/install.html)
- Install a nextflow compatible executor. This workflow provides a
profile for [docker](https://docs.docker.com/get-started/).
### Location of the workflow scripts
Nextflow workflow scripts, schemas and configuration files can be found
in the `target/nextflow` folder. On the `main` branch however, only the
source code that needs to be build into the functionning workflows and
components can be found. Instead, please refer to the `main_build`
branch or any of the tags to find the `target` folders. Components and
workflows are organized into namespaces, which can be nested. Workflows
are located at `target/nextflow/workflows`, while components that
execute individual workflow steps are
A reference of workflows and modules is also provided in the
[documentation](https://openpipelines.bio/components/).
### Retrieving a list of a workflow parameters
A list of workflows arguments can be consulted in multiple ways:
- On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest)
- In the [reference
documentation](https://openpipelines.bio/components/)
- The config YAML file lists the argument for each workflow and
component
- In the `target/nextflow` folder, a nextflow schema JSON file
(`nextflow_schema.json`) is provided next to each workflow `.nf` file.
- Using nextflow on the CLI:
``` bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/demux/main.nf \
--help
```
### Resource usage tuning
Nextflows labels can be used to specify the amount of resources a
process can use. This workflow uses the following labels for CPU, memory
and disk:
- `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem`
- `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu`
- `lowdisk`, `middisk`, `highdisk`, `veryhighdisk`
The defaults for these labels can be found at
`src/workflows/utils/labels.config`. Nextflow checks that the specified
resources for a process do not exceed what is available on the machine
and will not start if it does. Create your own config file to tune the
labels to your needs, for example:
// Resource labels
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 8 }
withLabel: midcpu { cpus = 16 }
withLabel: highcpu { cpus = 16 }
withLabel: verylowmem { memory = 4.GB }
withLabel: lowmem { memory = 8.GB }
withLabel: midmem { memory = 16.GB }
withLabel: highmem { memory = 32.GB }
When starting nextflow using the CLI, you can use `-c` to provide the
file to nextflow and overwrite the defaults.
### Demultiplexing example
Here, generating FASTQ files from raw sequencing data is demonstrated,
based on data generated using 10X genomics protocols. However, BD
genomics data is also supported by Openpipeline. If you wish to try it
out yourself, test data is available at
`s3://openpipelines-data/cellranger_tiny_bcl/bcl`.
``` bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/demux/main.nf \
-c "<path to resource config file>" \
-profile docker \
--publish_dir "<path to output directory>" \
--id "cellranger_tiny_bcl" \
--input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \
--sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \
--demultiplexer "mkfastq"
```
### Mapping and read counting
FASTQ files can be mapped to a reference genome and the resulting mapped
reads can be counted in order to generate a count matrix. Both
`BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate
using Cell Ranger multi on test data available at
`s3://openpipelines-data/10x_5k_anticmv`.
In order to facilitate passing multiple argument values, the parameters
can be specified using a YAML file.
``` yaml
input:
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz"
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz"
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz"
gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz"
vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv"
library_id:
- "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset"
- "5k_human_antiCMV_T_TBNK_connect_AB_subset"
- "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
library_type:
- "Gene Expression"
- "Antibody Capture"
- "VDJ"
```
You can pass this file to nextflow using `-params-file`
``` bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \
-c "<path to resource config file>" \
-profile docker \
-params-file "<path to your parameter YAML file>" \
--publish_dir "<path to output directory>"
```
### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration)
Once you have an MuData object for each of your samples, you can process
it into a multisample file that is ready for integration and other
downstream analyses. This can be done using the `process_samples`
workflow. Here is an example, but please keep in mind that the exact
parameters that need to be provided differ depending on you data. A lot
of functionality for this pipeline can be customized, including the name
of the output slots where data is being stored.
``` yaml
param_list:
- id: "sample_1"
input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
rna_min_counts: 2
- id: "sample_2"
input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
rna_min_counts: 1
rna_max_counts: 1000000
rna_min_genes_per_cell: 1
rna_max_genes_per_cell: 1000000
rna_min_cells_per_gene: 1
rna_min_fraction_mito: 0.0
rna_max_fraction_mito: 1.0
```
In order to provide multiple samples to the pipeline, `param_list` is
used. Using `param_list` it is possible to specify arguments per sample.
However, it is still possible to define arguments for all samples
together by listing those outside the `param_list` block.
``` bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
-c "<path to resource config file>" \
-profile docker \
-params-file "<path to your parameter YAML file>"
--publish_dir "<path to output directory>"
```
## Executing standalone components using the Viash executable
Another option to execute individual modules on the CLI is to use
`viash run`. All you need to do is download viash, clone the
Openpipeline repository and point viash to a config file. However, keep
in mind that using `viash run` for workflows is currently not supported.
Please see `viash run --help` for more information on how to use the
command, but here is an example:
``` bash
viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help
```

205
README.qmd Normal file
View File

@@ -0,0 +1,205 @@
---
format: gfm
---
```{r setup, include=FALSE}
project <- yaml::read_yaml("_viash.yaml")
license <- paste0(project$links$repository, "/blob/main/LICENSE")
```
# OpenPipeline
Extensible single cell analysis pipelines for reproducible and large-scale single cell processing using Viash and Nextflow.
[![ViashHub](https://img.shields.io/badge/ViashHub-`r project$name`-7a4baa.svg)](https://www.viash-hub.com/packages/`r project$name`)
[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2F`r project$name`-blue.svg)](`r project$links$repository`)
[![GitHub License](https://img.shields.io/github/license/`r project$organization`/`r project$name`.svg)](`r license`)
[![GitHub Issues](https://img.shields.io/github/issues/`r project$organization`/`r project$name`.svg)](`r project$links$issue_tracker`)
[![Viash version](https://img.shields.io/badge/Viash-v`r gsub("-", "--", project$viash_version)`-blue.svg)](https://viash.io)
## Documentation
Please find more in-depth documentation on [the website](https://openpipelines.bio/).
## Functionality Overview
Openpipelines execute a list of predefined tasks. These descrete steps are also provided as standalone components that can be executed individually, with a standardized interface. This is especially useful when a particular step wraps a tool that you do not necessarily always need to execute in a workflow context.
In terms of workflows, the following functionality is provided:
* Demultiplexing: conversion of raw sequencing data to FASTQ objects.
* [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion): Read mapping and generating a count matrix.
* [Single sample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample): cell filtering and doublet detection.
* [Multisample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing): Count transformation, normalization, QC metric calulations.
* [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration): Clustering, integration and batch correction using single and multimodal methods.
* Downstream analysis workflows
```{mermaid lang='mermaid'}
flowchart LR
demultiplexing["Step 1: Demultiplexing"]
ingestion["Step 2: Ingestion"]
process_samples["Step 3: Process Samples"]
integration["Step 4: Integration"]
downstream["Step 5: Downstream"]
demultiplexing-->ingestion-->process_samples-->integration-->downstream
```
## Guided execution using Viash Hub (CLI and Seqera cloud)
Openpipelines is now available on [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub provides a list of components and workflows, together with a graphical interface that guides you through the steps of running a workflow or standalone component. Intstructions are provided for using a local viash or nextflow executable (requires using a linux based OS), but connecting to a Seqera cloud instance is also supported.
## Execution using the nextflow executable
Executing a workflow is a bit more involved and requires familiarity with the command line interface (CLI).
### Setup
In order to use the workflows in this package on your local computer, you'll need to do the following:
* Install [nextflow](https://www.nextflow.io/docs/latest/install.html)
* Install a nextflow compatible executor. This workflow provides a profile for [docker](https://docs.docker.com/get-started/).
### Location of the workflow scripts
Nextflow workflow scripts, schema's and configuration files can be found in the `target/nextflow` folder.
On the `main` branch however, only the source code that needs to be build into the functionning workflows and components can be found.
Instead, please refer to the `main_build` branch or any of the tags to find the `target` folders.
Components and workflows are organized into namespaces, which can be nested. Workflows are located at `target/nextflow/workflows`,
while components that execute individual workflow steps are
A reference of workflows and modules is also provided in the [documentation](https://openpipelines.bio/components/).
### Retrieving a list of a workflow parameters
A list of workflows arguments can be consulted in multiple ways:
* On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest)
* In the [reference documentation](https://openpipelines.bio/components/)
* The config YAML file lists the argument for each workflow and component
* In the `target/nextflow` folder, a nextflow schema JSON file (`nextflow_schema.json`) is provided next to each workflow `.nf` file.
* Using nextflow on the CLI:
```bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/demux/main.nf \
--help
```
### Resource usage tuning
Nextflow's labels can be used to specify the amount of resources a process can use. This workflow uses the following labels for CPU, memory and disk:
* `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem`
* `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu`
* `lowdisk`, `middisk`, `highdisk`, `veryhighdisk`
The defaults for these labels can be found at `src/workflows/utils/labels.config`. Nextflow checks that the specified resources for a process do not exceed what is available on the machine and will not start if it does. Create your own config file to tune the labels to your needs, for example:
```
// Resource labels
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 8 }
withLabel: midcpu { cpus = 16 }
withLabel: highcpu { cpus = 16 }
withLabel: verylowmem { memory = 4.GB }
withLabel: lowmem { memory = 8.GB }
withLabel: midmem { memory = 16.GB }
withLabel: highmem { memory = 32.GB }
```
When starting nextflow using the CLI, you can use `-c` to provide the file to nextflow and overwrite the defaults.
### Demultiplexing example
Here, generating FASTQ files from raw sequencing data is demonstrated, based on data generated using 10X genomic's protocols. However, BD genomics data is also supported by Openpipeline. If you wish to try it out yourself, test data is available at `s3://openpipelines-data/cellranger_tiny_bcl/bcl`.
```bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/demux/main.nf \
-c "<path to resource config file>" \
-profile docker \
--publish_dir "<path to output directory>" \
--id "cellranger_tiny_bcl" \
--input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \
--sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \
--demultiplexer "mkfastq"
```
### Mapping and read counting
FASTQ files can be mapped to a reference genome and the resulting mapped reads can be counted in order to generate a count matrix. Both `BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate using Cell Ranger multi on test data available at `s3://openpipelines-data/10x_5k_anticmv`.
In order to facilitate passing multiple argument values, the parameters can be specified using a YAML file.
```yaml
input:
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz"
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz"
- "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz"
gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz"
vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv"
library_id:
- "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset"
- "5k_human_antiCMV_T_TBNK_connect_AB_subset"
- "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
library_type:
- "Gene Expression"
- "Antibody Capture"
- "VDJ"
```
You can pass this file to nextflow using `-params-file`
```bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \
-c "<path to resource config file>" \
-profile docker \
-params-file "<path to your parameter YAML file>" \
--publish_dir "<path to output directory>"
```
### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration)
Once you have an MuData object for each of your samples, you can process it into a multisample file that is ready for integration and other downstream analyses. This can be done using the `process_samples` workflow. Here is an example, but please keep in mind that the exact parameters that need to be provided differ depending on you data. A lot of functionality for this pipeline can be customized, including the name of the output slots where data is being stored.
```yaml
param_list:
- id: "sample_1"
input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
rna_min_counts: 2
- id: "sample_2"
input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
rna_min_counts: 1
rna_max_counts: 1000000
rna_min_genes_per_cell: 1
rna_max_genes_per_cell: 1000000
rna_min_cells_per_gene: 1
rna_min_fraction_mito: 0.0
rna_max_fraction_mito: 1.0
```
In order to provide multiple samples to the pipeline, `param_list` is used. Using `param_list` it is possible to specify arguments per sample. However, it is still possible to define arguments for all samples together by listing those outside the `param_list` block.
```bash
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 \
-main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
-c "<path to resource config file>" \
-profile docker \
-params-file "<path to your parameter YAML file>"
--publish_dir "<path to output directory>"
```
## Executing standalone components using the Viash executable
Another option to execute individual modules on the CLI is to use `viash run`. All you need to do is download viash, clone the Openpipeline repository and point viash to a config file. However, keep in mind that using `viash run` for workflows is currently not supported. Please see `viash run --help` for more information on how to use the command, but here is an example:
```bash
viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help
```

40
_viash.yaml Normal file
View File

@@ -0,0 +1,40 @@
viash_version: 0.9.7
source: src
target: target
# Note: this causes the docker images to be renamed
name: openpipeline
organization: vsh
summary: |
Best-practice workflows for single-cell multi-omics analyses.
description: |
OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).
In terms of workflows, the following has been made available, but keep in mind that
individual tools and functionality can be executed as standalone components as well.
* Demultiplexing: conversion of raw sequencing data to FASTQ objects.
* Ingestion: Read mapping and generating a count matrix.
* Single sample processing: cell filtering and doublet detection.
* Multisample processing: Count transformation, normalization, QC metric calulations.
* Integration: Clustering, integration and batch correction using single and multimodal methods.
* Downstream analysis workflows
license: MIT
keywords: [single-cell, multimodal]
links:
repository: https://github.com/openpipelines-bio/openpipeline
docker_registry: ghcr.io
homepage: https://openpipelines.bio
documentation: https://openpipelines.bio/fundamentals
issue_tracker: https://github.com/openpipelines-bio/openpipeline/issues
info:
test_resources:
- type: s3
path: s3://openpipelines-data
dest: resources_test
nextflow_labels_ci:
- path: src/workflows/utils/labels_ci.config
description: Adds the correct memory and CPU labels when running on the Viash Hub CI.
config_mods: |
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
version: processing-workflows-quantile-filtering

5340
images/concepts/fig.svg Normal file

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 389 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 8.9 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 9.8 KiB

View File

@@ -0,0 +1,9 @@
#!/bin/bash
# so let's do it separately
rm images/concepts/fig_*.svg
for id in cell modality_rna modality_adt modality_vdj modality_atac workflow_multiomics_rna_singlesample workflow_multiomics_rna_multisample workflow_multiomics_adt_singlesample workflow_multiomics_adt_multisample; do
inkscape --export-type="svg" --export-id="$id" --export-id-only images/concepts/fig.svg
svgo images/concepts/fig_${id}.svg
done

5
main.nf Normal file
View File

@@ -0,0 +1,5 @@
nextflow.enable.dsl=2
workflow {
print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
}

22
nextflow.config Normal file
View File

@@ -0,0 +1,22 @@
// template nextflow.config for nested workflows
manifest {
nextflowVersion = '!>=20.12.1-edge'
}
// TODO 1: unquote and adapt `rootDir` according to relative path within project
// params {
// rootDir = "$projectDir/../.."
// }
//
// workflowDir = "${params.rootDir}/workflows"
// targetDir = "${params.rootDir}/target/nextflow"
// TODO 2: insert custom imports here
// TODO 3: unquote
// docker {
// runOptions = "-v \$(realpath ${params.rootDir}):\$(realpath ${params.rootDir})"
// }

View File

@@ -0,0 +1,214 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_fixed
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
# dataset page:
# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
function seqkit_head {
input="$1"
output="$2"
if [[ ! -f "$output" ]]; then
echo "> Processing `basename $input`"
seqkit head -n 200000 "$input" | gzip > "$output"
fi
}
orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz"
# download feature reference
feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
if [[ ! -f "$feature_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
fi
# download probe set
probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
if [[ ! -f "$probe_set" ]]; then
wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set"
fi
sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
if [[ ! -f "$probe_set_corrected" ]]; then
reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf"
cat "$probe_set" | while read line || [[ -n $line ]];
do
echo "Line: $line"
old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
echo "Old ID: $old_id"
if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
echo "Just writing line"
printf "%s\n" "$line" >> "$probe_set_corrected"
else
gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
if [ ! -z "$gtf_lookup" ]; then
echo "Found hit"
new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
echo "New ID: $new_id"
new_line=${line/"$old_id"/"$new_id"}
printf "%s\n" "$new_line" >> "$probe_set_corrected"
else
echo "Did not find hit"
fi
fi
done
fi
# Input FASTA:
# >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
# Output FASTA:
# >chr1 1
input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
if [[ ! -f "$input_fastq" ]]; then
wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
fi
zcat "$input_fastq" \
| sed -E 's/^>(\S+).*/>\1 \1/' \
| sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
| sed -E 's/^>MT />chrM /' \
> "$fasta_modified"
pigz --fast "$fasta_modified"
fasta_modified="$fasta_modified.gz"
# Input GTF:
# ... gene_id "ENSG00000223972.5"; ...
# Output GTF:
# ... gene_id "ENSG00000223972"; gene_version "5"; ...
input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
if [[ ! -f "$input_gtf" ]]; then
wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
fi
REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
zcat "$input_gtf" \
| sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
| sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
| sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
> "$gtf_modified"
pigz --fast "$gtf_modified"
gtf_modified="$gtf_modified.gz"
final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
if [ ! -f "$final_genome" ]; then
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
-profile docker \
-resume \
--id "GRCh38" \
--genome_fasta "$fasta_modified" \
--transcriptome_gtf "$gtf_modified" \
--target "cellranger" \
--output_fasta "reference.fa.gz" \
--output_gtf "reference.gtf.gz" \
--output_cellranger "GRCh38.cellranger.genome.fa.gz" \
--publish_dir "$HOME/.cache/openpipeline/"
fi
# Run mapping pipeline
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- ${orig_sample_id}_subset
library_type:
- "Gene Expression"
library_lanes:
- "any"
probe_set: "$probe_set_corrected"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
probe_barcode_ids:
- BC001
- BC002
- BC003
- BC004
sample_ids:
- Liver_BC1
- Ovarian_BC2
- Colorectal_BC3
- Pancreas_BC4
gex_generate_bam: false
sample_force_cells:
- 5000
- -1
- -1
- -1
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config \
--publish_dir "$OUT/processed"
mkdir -p "${OUT}/processed"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config \
--publish_dir "${OUT}_v10/processed"

View File

@@ -0,0 +1,169 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_4plex_dtc
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
# dataset page:
# https://www.10xgenomics.com/datasets/40k-mixture-of-dissociated-tumor-cells-from-3-donors-stained-with-totalseqc-antibodies
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/4plex_DTC_kidney_lung_breast_TotalSeqC_fastqs"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
function seqkit_head {
input="$1"
output="$2"
if [[ ! -f "$output" ]]; then
echo "> Processing `basename $input`"
seqkit head -n 100000 "$input" | gzip > "$output"
fi
}
orig_sample_id="4plex_DTC_kidney_lung_breast_TotalSeqC"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R2_001.fastq.gz"
# download feature reference
feature_ref="$raw_dir/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv"
if [[ ! -f "$feature_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
fi
# download probe set
probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv"
if [[ ! -f "$probe_set" ]]; then
wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv" -O "$probe_set"
fi
sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A-corrected.csv"
if [[ ! -f "$probe_set_corrected" ]]; then
reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf"
cat "$probe_set" | while read line || [[ -n $line ]];
do
echo "Line: $line"
old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
echo "Old ID: $old_id"
if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
echo "Just writing line"
printf "%s\n" "$line" >> "$probe_set_corrected"
else
gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
if [ ! -z "$gtf_lookup" ]; then
echo "Found hit"
new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
echo "New ID: $new_id"
new_line=${line/"$old_id"/"$new_id"}
printf "%s\n" "$new_line" >> "$probe_set_corrected"
else
echo "Did not find hit"
fi
fi
done
fi
# Run mapping pipeline
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- ${orig_sample_id}_gex_subset
- ${orig_sample_id}_ab_subset
library_type:
- "Gene Expression"
- "Antibody Capture"
library_lanes:
- "any"
- "any"
probe_set: "$probe_set_corrected"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
probe_barcode_ids:
- BC001+AB001
- BC002+AB002
- BC003+AB003
- BC004+AB004
sample_ids:
- Kidney_Cancer1_BC1_AB1
- Kidney_Cancer2_BC2_AB2
- Breast_Cancer_BC3_AB3
- Lung_Cancer_BC4_AB4
gex_generate_bam: false
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
--publish_dir "$OUT/processed" \
-c src/workflows/utils/labels_ci.config
mkdir -p "${OUT}_v10/processed"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
--publish_dir "${OUT}_v10/processed" \
-c src/workflows/utils/labels_ci.config

View File

@@ -0,0 +1,232 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_anticmv
OUT=resources_test/$ID
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/resources/datasets/integrated-gex-totalseqc-and-tcr-analysis-of-connect-generated-library-from-5k-cmv-t-cells-2-standard
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/5k_human_antiCMV_T_TBNK_connect_Multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
function seqkit_head {
input="$1"
output="$2"
if [[ ! -f "$output" ]]; then
echo "> Processing `basename $input`"
seqkit head -n 200000 "$input" | gzip > "$output"
fi
}
orig_sample_id="5k_human_antiCMV_T_TBNK_connect"
seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R1_001.fastq.gz"
seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R2_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R2_001.fastq.gz"
# download immune panel fasta if needed
feature_reference="$raw_dir/feature_reference.csv"
if [[ ! -f "$feature_reference" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_count_feature_reference.csv" -O "$feature_reference"
fi
# download vdj reference if needed
vdj_ref="$raw_dir/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
if [[ ! -f "$vdj_ref" ]]; then
wget "https://cf.10xgenomics.com/supp/cell-vdj/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" -O "$vdj_ref"
fi
# Run mapping pipeline
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_GEX_1_subset"
- "${orig_sample_id}_AB_subset"
- "${orig_sample_id}_VDJ_subset"
library_type:
- "Gene Expression"
- "Antibody Capture"
- "VDJ"
gex_reference: "$genome_tar"
vdj_reference: "$vdj_ref"
feature_reference: "$feature_reference"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-r 3.0.0 \
-resume \
--publish_dir "$OUT/processed" \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config
mkdir -p "${OUT}_v10/processed"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
--publish_dir "${OUT}_v10/processed" \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config
# Convert to h5mu
cat > /tmp/params.yaml << HERE
id: "$orig_sample_id"
input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output"
publish_dir: "$OUT/"
output: "*.h5mu"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu"
# run qc workflow
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/$orig_sample_id.h5mu"
var_name_mitochondrial_genes: mitochondrial
var_name_ribosomal_genes: ribosomal
publish_dir: "$OUT/"
output: "${orig_sample_id}_qc.h5mu"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/workflows/qc/qc/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
# Run full pipeline
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/${orig_sample_id}_qc.h5mu"
publish_dir: "$OUT/"
output: "${orig_sample_id}_mms.h5mu"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
# create fastqc directory
fastqc_dir="$OUT/fastqc"
mkdir -p "$fastqc_dir"
./target/executable/qc/fastqc/fastqc \
--input "$raw_dir" \
--mode "dir" \
--output "$fastqc_dir"
# Create a test dataset for the Custom modality
# by just labeling the AB as custom
feat_ref_name=$(basename $feature_reference)
sed -e 's/Antibody Capture/Custom/g' "$feature_reference" > "/tmp/custom_${feat_ref_name}"
cat > /tmp/params_custom.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_GEX_1_subset"
- "${orig_sample_id}_AB_subset"
- "${orig_sample_id}_VDJ_subset"
library_type:
- "Gene Expression"
- "Custom"
- "VDJ"
gex_reference: "$genome_tar"
feature_reference: "/tmp/custom_${feat_ref_name}"
vdj_reference: "$vdj_ref"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params_custom.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config \
--publish_dir "${OUT}/processed_with_custom"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params_custom.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config \
--publish_dir "${OUT}_v10/processed_with_custom"

View File

@@ -0,0 +1,140 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_beam
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/datasets/5k-human-a0201-b0702-pbmcs-beam-t-2-standard
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/5k_human_A0201_B0702_PBMCs_BEAM_T"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
function seqkit_head {
input="$1"
output="$2"
if [[ ! -f "$output" ]]; then
echo "> Processing `basename $input`"
seqkit head -n 200000 "$input" | gzip > "$output"
fi
}
orig_sample_id="beamt_human_A0201_B0702_pbmc"
seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R2_001.fastq.gz"
# download feature reference
feature_ref="$raw_dir/beamt_human_A0201_B0702_pbmc_feature_reference.csv"
if [[ ! -f "$feature_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_count_feature_reference.csv" -O "$feature_ref"
fi
# download vdj reference if needed
vdj_ref="$raw_dir/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz"
if [[ ! -f "$vdj_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -O "$vdj_ref"
fi
# Run mapping pipeline
# TODO: Also include conversion to h5mu
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_gex_subset"
- "${orig_sample_id}_vdj_subset"
- "${orig_sample_id}_ag_subset"
library_type:
- "Gene Expression"
- "VDJ-T"
- "Antigen Capture"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
vdj_reference: "$vdj_ref"
control_id:
- negative_control_A0201
- negative_control_B0702
mhc_allele:
- "HLA-A*02:01"
- "HLA-B*07:02"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config \
--publish_dir "$OUT/processed"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config \
--publish_dir "${OUT}_v10/processed"
# Create h5mu
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/processed/$ID.cellranger_multi.output"
publish_dir: "$OUT/"
output: "*.h5mu"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config
mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu"

View File

@@ -0,0 +1,151 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_lung_crispr
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/resources/datasets/5-k-a-549-lung-carcinoma-cells-no-treatment-transduced-with-a-crispr-pool-3-1-standard-6-0-0
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
function seqkit_head {
input="$1"
output="$2"
if [[ ! -f "$output" ]]; then
echo "> Processing `basename $input`"
seqkit head -n 200000 "$input" | gzip > "$output"
fi
}
orig_sample_id="SC3_v3_NextGem_DI_CRISPR_A549_5K"
seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R2_001.fastq.gz"
# download crispr feature reference
crispr_ref="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv"
if [[ ! -f "$crisp_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" -O "$crispr_ref"
fi
crispr_ref_adjusted="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv"
reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
cat "$crispr_ref" | while read line || [[ -n $line ]];
do
echo "Line: $line"
old_id=$( printf "%s\n" "$line" | awk -F',' '{print $7}' )
echo "Old ID: $old_id"
if [ "$old_id" = "Non-Targeting" ] || [ "$old_id" = "target_gene_id" ] ; then
echo "Just writing line"
printf "%s\n" "$line" >> "$crispr_ref_adjusted"
else
gtf_lookup=$(zgrep "$old_id" "$reference_gtf" || test $? = 1;)
if [ ! -z "$gtf_lookup" ]; then
echo "Found hit"
new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
echo "New ID: $new_id"
new_line=${line/"$old_id"/"$new_id"}
printf "%s\n" "$new_line" >> "$crispr_ref_adjusted"
else
echo "Did not find hit"
fi
fi
done
# Run mapping pipeline
# As of Cell Ranger v10, gex_secondary_analysis is required
# to generate CRISPR output.
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_gex_subset"
- "${orig_sample_id}_crispr_subset"
library_type:
- "Gene Expression"
- "CRISPR Guide Capture"
gex_reference: "$genome_tar"
feature_reference: "$crispr_ref_adjusted"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
--publish_dir "${OUT}/processed"
mkdir -p "${OUT}_v10/processed"
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
--publish_dir "${OUT}_v10/processed" \
--gex_secondary_analysis
# Create h5mu
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/processed/10x_5k_lung_crispr.cellranger_multi.output"
output: "*.h5mu"
HERE
nextflow \
run openpipelines-bio/openpipeline \
-r 3.0.0 \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
--publish_dir "$OUT/"
mv "${OUT}/0.h5mu" "${OUT}/$orig_sample_id.h5mu"

View File

@@ -0,0 +1,170 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
ID=annotation_test_data
OUT=resources_test/$ID/
# ideally, this would be a versioned pipeline run
[ -d "$OUT" ] || mkdir -p "$OUT"
# Download Tabula Sapiens Blood reference h5ad from https://doi.org/10.5281/zenodo.7587774
wget "https://zenodo.org/record/7587774/files/TS_Blood_filtered.h5ad?download=1" -O "${OUT}/tmp_TS_Blood_filtered.h5ad"
# Download Tabula Sapiens Blood pretrained model from https://doi.org/10.5281/zenodo.7580707
wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?download=1" -O "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
# Download PopV specific CL ontology files - needed for OnClass
# OUT_ONTOLOGY="${OUT}/ontology"
# [ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.obo \
# -O "${OUT_ONTOLOGY}/cl.obo"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology \
# -O "${OUT_ONTOLOGY}/cl.ontology"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology.nlp.emb \
# -O "${OUT_ONTOLOGY}/cl.ontology.nlp.emb"
# Process Tabula Sapiens Blood reference h5ad
# (Select one individual and 100 cells per cell type)
# normalize and log1p transform data
# Add treatment and disease columns
python <<HEREDOC
import anndata as ad
import scanpy as sc
import numpy as np
# Read in data
ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"]
n=100
s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
# Normalize and log1p transform data
data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X)
sc.pp.normalize_total(data_for_scanpy, target_sum=10000)
sc.pp.log1p(
data_for_scanpy,
base=None,
layer=None,
copy=False,
)
sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X
# Add treatment and disease columns
n_cells = sub_ref_adata_final.n_obs
treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5])
disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
sub_ref_adata_final.obs["treatment"] = treatment
sub_ref_adata_final.obs["disease"] = disease
# Strip raw slot - not needed for annotation and causes compatibility issues between AnnData/MuData versions
sub_ref_adata_final = sub_ref_adata_final.copy()
sub_ref_adata_final.raw = None
# Write out data
sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
HEREDOC
echo "> Converting to h5mu"
viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5ad" \
--output "${OUT}/TS_Blood_filtered.h5mu" \
--modality "rna"
rm "${OUT}/tmp_TS_Blood_filtered.h5ad"
echo "> Downloading pretrained CellTypist model and sample test data"
wget https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl \
-O "${OUT}/celltypist_model_Immune_All_Low.pkl"
wget https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad \
-O "${OUT}/demo_2000_cells.h5ad"
viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
--input "${OUT}/demo_2000_cells.h5ad" \
--output "${OUT}/demo_2000_cells.h5mu" \
--modality "rna"
echo "> Fetching OnClass data and models"
OUT_ONTOLOGY="${OUT}/ontology"
[ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
wget https://figshare.com/ndownloader/files/28394466 -O "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
tar -xzvf "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" -C "${OUT_ONTOLOGY}" --strip-components=2
rm "${OUT_ONTOLOGY}/allen.ontology"
rm "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
wget https://figshare.com/ndownloader/files/28394541 -O "${OUT}/OnClass_models.tar.gz"
tar -xzvf "${OUT}/OnClass_models.tar.gz" -C "${OUT}" --strip-components=1
rm "${OUT}/OnClass_models.tar.gz"
rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} +
mv "${OUT}/Pretrained_model" "${OUT}/onclass_model"
echo "> Creating simple SCVI model"
viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--obs_batch "donor_id" \
--var_gene_names "ensemblid" \
--output "${OUT}/scvi_output.h5mu" \
--output_model "${OUT}/scvi_model" \
--max_epochs 5 \
--n_obs_min_count 10 \
--n_var_min_count 10
echo "> Creating SCVI model with covariates"
viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--obs_batch "donor_id" \
--var_gene_names "ensemblid" \
--obs_categorical_covariate "assay" \
--obs_categorical_covariate "donor_assay" \
--output "${OUT}/scvi_covariate_output.h5mu" \
--output_model "${OUT}/scvi_covariate_model" \
--max_epochs 5 \
--n_obs_min_count 10 \
--n_var_min_count 10
echo "> Creating simple SCANVI model"
viash run src/annotate/scanvi/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--var_gene_names "ensemblid" \
--obs_labels "cell_ontology_class" \
--scvi_model "${OUT}/scvi_model" \
--output "${OUT}/scanvi_output.h5mu" \
--output_model "${OUT}/scanvi_model" \
--max_epochs 5
echo "> Creating SCANVI model with covariates"
viash run src/annotate/scanvi/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--var_gene_names "ensemblid" \
--obs_labels "cell_ontology_class" \
--scvi_model "${OUT}/scvi_covariate_model" \
--output "${OUT}/scanvi_covariate_output.h5mu" \
--output_model "${OUT}/scanvi_covariate_model" \
--max_epochs 5
rm "${OUT}/scanvi_output.h5mu"
rm "${OUT}/scanvi_covariate_output.h5mu"
rm "${OUT}/scvi_output.h5mu"
rm "${OUT}/scvi_covariate_output.h5mu"
rm -r "${OUT}/Pretrained_model/"
echo "> Creating Pseudobulk Data for DGEA"
viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--obs_grouping "cell_type" \
--obs_sample_conditions "donor_id" \
--obs_sample_conditions "treatment" \
--obs_sample_conditions "disease" \
--min_num_cells_per_sample 5 \
--output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu"

View File

@@ -0,0 +1,8 @@
#!/bin/bash
set -eo pipefail
aws s3 sync --profile di "resources_test" "s3://openpipelines-data" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
id=cellranger_tiny_fastq
aws s3 sync --profile di "resources_test/$id" "s3://openpipelines-data/$id" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun

View File

@@ -0,0 +1,144 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=bdrhap_5kjrt
OUT=resources_test/$ID
n_threads=30
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1"
genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/12WTA-ABC-SMK-EB-5kJRT"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
genome_dir="$raw_dir/temp_reference_gencodev41_chr1"
if [[ ! -d "$genome_dir" ]]; then
echo "> Untarring genome"
mkdir -p "$genome_dir"
tar -xvf "$genome_tar" -C "$genome_dir"
fi
# process WTA fastq files
# map to chr1, subsample chr1 reads
mapping_dir="$raw_dir/temp_mapping_chr_1"
if [[ ! -f "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" ]]; then
echo "> Processing 12WTA_S1_L432_R[12]_001.fastq.gz"
mkdir -p "$mapping_dir"
# MUST USE A STAR THAT IS COMPATIBLE WITH BD RHAPSODY
# For the cwl pipeline 1.9.1, 2.5.2b should work.
echo "star"
docker run --rm -i \
-v "`pwd`/$OUT:`pwd`/$OUT" \
-v "$tar_dir:$tar_dir" \
-w `pwd` bdgenomics/rhapsody:1.10.1 \
STAR \
--runThreadN "$n_threads" \
--genomeDir "$genome_dir" \
--readFilesIn "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" \
--runRNGseed 100 \
--outFileNamePrefix "$mapping_dir/" \
--readFilesCommand "gzip -d -k -c" \
--clip3pAdapterSeq "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
--outFilterMatchNmin "25" \
--quantTranscriptomeBan "Singleend" # Prohibit mapping of one side of the read
# chown to current user before removing mapping dir
docker run --rm -i -v "`pwd`/$OUT:`pwd`/$OUT" -w `pwd` bdgenomics/rhapsody:1.10.1 \
chown "$(id -u):$(id -g)" --silent --recursive "$mapping_dir/"
echo "samtools"
samtools view -F 260 "$mapping_dir/Aligned.out.sam" > "$mapping_dir/primary_aligned_reads.sam"
echo "cut"
cut -f 1 "$mapping_dir/primary_aligned_reads.sam" | sort | uniq > "$mapping_dir/mapped_reads.txt"
head -500000 "$mapping_dir/mapped_reads.txt" > "$mapping_dir/mapped_reads_subset.txt"
echo "seqkit"
seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R1_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq"
seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq"
# rm -r "$mapping_dir"
# rm -r "$genome_dir"
fi
# subsample other files
smk_r1_file="$raw_dir/12SMK_S1_L432_R1_001_subset.fastq.gz"
if [[ ! -f "$smk_r1_file" ]]; then
echo "> Processing `basename $smk_r1_file`"
seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R1_001.fastq.gz" | gzip > "$smk_r1_file"
fi
smk_r2_file="$raw_dir/12SMK_S1_L432_R2_001_subset.fastq.gz"
if [[ ! -f "$smk_r2_file" ]]; then
echo "> Processing `basename $smk_r2_file`"
seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R2_001.fastq.gz" | gzip > "$smk_r2_file"
fi
abc_r1_file="$raw_dir/12ABC_S1_L432_R1_001_subset.fastq.gz"
if [[ ! -f "$abc_r1_file" ]]; then
echo "> Processing `basename $abc_r1_file`"
seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R1_001.fastq.gz" | gzip > "$abc_r1_file"
fi
abc_r2_file="$raw_dir/12ABC_S1_L432_R2_001_subset.fastq.gz"
if [[ ! -f "$abc_r2_file" ]]; then
echo "> Processing `basename $abc_r2_file`"
seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R2_001.fastq.gz" | gzip > "$abc_r2_file"
fi
wta_r1_file="$raw_dir/12WTA_S1_L432_R1_001_subset.fastq.gz"
if [[ ! -f "$wta_r1_file" ]]; then
echo "> Processing `basename $wta_r1_file`"
gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" > "$wta_r1_file"
fi
wta_r2_file="$raw_dir/12WTA_S1_L432_R2_001_subset.fastq.gz"
if [[ ! -f "$wta_r2_file" ]]; then
echo "> Processing `basename $wta_r2_file`"
gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" > "$wta_r2_file"
fi
# copy immune panel fasta
fasta_file="$raw_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta"
if [[ ! -f "$fasta_file" ]]; then
cp "$tar_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" "$fasta_file"
fi
genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
nextflow run . \
-main-script target/nextflow/workflows/ingestion/bd_rhapsody/main.nf \
-resume \
-profile docker,mount_temp \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/errorstrat_ignore.config \
--reads "$wta_r1_file;$wta_r2_file;$abc_r1_file;$abc_r2_file;$smk_r1_file;$smk_r2_file" \
--reference_archive "$genome_tar" \
--abseq_reference "$fasta_file" \
--sample_tags_version "hs" \
--tag_names "1-Jurkat;2-Ramos;3-THP1" \
--output_raw "output_raw" \
--output "output.h5mu" \
--output_state state.yaml \
--cell_calling_data "mRNA" \
--exact_cell_count 4000 \
--generate_bam true \
--publish_dir "$OUT/processed"

View File

@@ -0,0 +1,72 @@
#!/bin/bash
set -eo pipefail
# TODO: we should turn this into viash components
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=bdrhap_vdj
OUT=resources_test/$ID
n_threads=30
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/VDJDemo"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/VDJDemo/VDJDemo.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# subset fastq files
for sample_id in RhapVDJDemo-BCR_S1_L001_R1_001 RhapVDJDemo-BCR_S1_L001_R2_001 RhapVDJDemo-mRNA_S5_L001_R1_001 RhapVDJDemo-mRNA_S5_L001_R2_001 RhapVDJDemo-TCR_S3_L001_R1_001 RhapVDJDemo-TCR_S3_L001_R2_001; do
subset_file="$raw_dir/${sample_id}_subset.fastq.gz"
if [[ ! -f "$subset_file" ]]; then
echo "> Processing $sample_id"
seqkit head -n 300000 "$tar_dir/$sample_id.fastq.gz" | gzip > "$subset_file"
fi
unset subset_file
done
# copy immune panel fasta
fasta_file="$raw_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta"
if [[ ! -f "$fasta_file" ]]; then
cp "$tar_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" "$fasta_file"
fi
# create params file
cat > /tmp/params.yaml << HERE
param_list:
- id: "targeted_vdj"
input: "$raw_dir/RhapVDJDemo-*_S*_L001_R[12]_001_subset.fastq.gz"
mode: targeted
reference: "$fasta_file"
publish_dir: "$OUT/processed"
putative_cell_call: "mRNA"
vdj_version: human
HERE
# run bd rhapsody pipeline
nextflow \
run . \
-main-script src/workflows/ingestion/bd_rhapsody/main.nf \
-resume \
-profile docker,mount_temp \
-with-trace work/trace.txt \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config

View File

@@ -0,0 +1,74 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
# settings
ID=cellranger_atac_tiny_bcl
OUT="resources_test/$ID/"
DIR="$OUT"
REFERENCE_DIR=resources_test/reference_gencodev41_chr1
# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT
viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb
# download bcl data
if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
mkdir -p "$OUT/bcl"
# download tar gz
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-1.0.0.tar.gz \
--output "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
# untar
tar -xf "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" \
--strip-components=1 \
-C "$OUT/bcl"
# remove tar
rm "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
# Download the layout file. It contains info about the samples (1 in this case) and lanes
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-simple-1.0.0.csv \
--output "${OUT}/bcl/layout.csv"
# download sample sheet
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-samplesheet-1.0.0.csv \
--output "${OUT}/bcl/sample_sheet.csv"
fi
if [ ! -d "${OUT}/fastqs" ]; then
mkdir -p "$OUT/fastqs"
target/docker/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq \
--input "${OUT}/bcl" \
--csv "${OUT}/bcl/layout.csv" \
--output "${OUT}/fastqs"
fi
# Create count matrices
if [ ! -d "${OUT}/counts" ]; then
mkdir -p "$OUT/counts"
target/docker/mapping/cellranger_atac_count/cellranger_atac_count \
--input "${OUT}/fastqs/HJN3KBCX2/test_sample/" \
--reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \
--output "${OUT}/counts"
fi

View File

@@ -0,0 +1,112 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
# settings
ID=cellranger_tiny_bcl
OUT="resources_test/$ID/"
DIR="$OUT"
# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
function clean_up {
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT
# download bcl data
if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then
mkdir -p "$OUT/bcl"
# download tar gz
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz \
--output "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
# untar
tar -xf "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" \
--strip-components=1 \
-C "$OUT/bcl"
# remove tar
rm "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz"
# download sample sheet
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv \
--output "${OUT}/bcl/sample_sheet.csv"
fi
if [ ! -d "${OUT}/fastqs" ]; then
mkdir -p "$OUT/fastqs"
target/docker/demux/cellranger_mkfastq/cellranger_mkfastq \
--input "${OUT}/bcl" \
--sample_sheet "${OUT}/bcl/sample_sheet.csv" \
--output "${OUT}/fastqs"
fi
# bcl-convert requires a v2 sample sheet
# bcl-convert is a bit more strict concerning filter files being present or not.
# We make a copy and make the necessary adaptations.
# We are using the tiny bcl dataset provided by Illumina:
# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
# Unfortunately,
# 1. the sample sheet delivered with it does not work with bcl-convert (v1 of the format)
# 2. 2 filter files are missing from the run directory that bcl-convert requires to run
#
# We worked around this by
# 1. Manually editing a sample sheet file suited for bcl-convert (format v2)
# 2. Adding a filter file
#
# The filter file is a binary file, we just created an empty file use that.
# bcl-convert might complain about it, but at least something is written out.
# An alternative is to use a filter file from a different project. This also generates
# a warning, but the fastq ouput files contain reads. The drawback is that those filter files
# are generally above 100MB in size.
#
# TODO: Check if a (binary) filter file can be generated that is small but works.
if [ ! -f "${OUT}/bcl2/sample_sheet.csv" ]; then
mkdir "${OUT}/bcl2/"
cp -r ${OUT}/bcl/* "${OUT}/bcl2/"
cat > "${OUT}/bcl2/sample_sheet.csv" << HERE
[Header],,,,,,,,,
FileFormatVersion,2,,,,,,
RunName,hiseq_test,,,,,,
InstrumentPlatform,NextSeq,,,,,,
IndexOrientation,Forward,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
Read1Cycles,26,,,,,,,,,
Read2Cycles,98,,,,,,
,,,,,,,,,
[Sequencing_Settings],,,,,,,
,,,,,,,
[BCLConvert_Settings],,,,,,,
SoftwareVersion,3.8.4,,,,,,
NoLaneSplitting,true,,,,,,
FastqCompressionFormat,gzip,,,,,,
,,,,,,,,,
[BCLConvert_Data],,,,,,,
Sample_ID,index,,,,,,
s1,GGTTTACT,,,,,,
,,,,,,,
[Cloud_Settings],,,,,,,
GeneratedVersion,1.3.0.202111171923,,,,,,
,,,,,,,
[Cloud_Data],,,,,,,
Sample_ID,ProjectName,LibraryName,LibraryPrepKitName,IndexAdapterKitName,I7_Index_ID,Sample_Name,Description,Instrument,Type
s1,p1,s1_SI-P03-C9,,,IDT01,SI-P03-C9,s1,NextSeq,HighOutput_75cycles
HERE
touch "${OUT}/bcl2/Data/Intensities/BaseCalls/L001/s_1_1101.filter"
fi

View File

@@ -0,0 +1,118 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
# settings
ID=cellranger_tiny_fastq
OUT="resources_test/$ID/"
DIR="$OUT"
# download cellranger tar gz
cellranger_tar_gz="${OUT}/temp_cellranger-6.1.2.tar.gz"
if [ ! -f "$cellranger_tar_gz" ]; then
echo "Download Cell Ranger 6.1.2 manually first!"
exit 1
fi
# untar fastqs
cellranger_tiny_fastq="${OUT}/cellranger_tiny_fastq"
if [ ! -f "${cellranger_tiny_fastq}/tinygex_S1_L001_R1_001.fastq.gz" ]; then
mkdir -p "$cellranger_tiny_fastq"
tar -xzf "$cellranger_tar_gz" \
-C "$cellranger_tiny_fastq" \
"cellranger-6.1.2/external/cellranger_tiny_fastq" \
--strip-components=3
fi
# untar ref
cellranger_tiny_ref="${OUT}/cellranger_tiny_ref"
if [ ! -f "${cellranger_tiny_ref}/reference.json" ]; then
mkdir -p "$cellranger_tiny_ref"
tar -xzf "$cellranger_tar_gz" \
-C "$cellranger_tiny_ref" \
"cellranger-6.1.2/external/cellranger_tiny_ref" \
--strip-components=3
fi
# Create ref with more recent STAR version
recent_ref_dir="${OUT}/cellranger_tiny_ref_v2_7_10_a"
if [ ! -f "${recent_ref_dir}/Genome" ]; then
mkdir -p "${recent_ref_dir}"
target/docker/mapping/star_build_reference/star_build_reference \
--genome_fasta "$cellranger_tiny_ref/fasta/genome.fa" \
--output "$recent_ref_dir" \
--genomeSAindexNbases 7 \
--transcriptome_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz"
fi
# run cellranger count
bam_dir="${OUT}/bam"
if [ ! -f "$bam_dir/possorted_genome_bam.bam" ]; then
mkdir -p "$bam_dir"
viash run src/mapping/cellranger_count/config.vsh.yaml -- \
--input "$cellranger_tiny_fastq" \
--reference "$cellranger_tiny_ref" \
--output "$bam_dir"
fi
# convert to h5mu
raw_h5mu="${OUT}/raw_dataset.h5mu"
if [ ! -f "$step1_h5mu" ]; then
viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
--input "${bam_dir}/raw_feature_bc_matrix.h5" \
--output "$raw_h5mu"
fi
# run velocyto
velo_gtf="$cellranger_tiny_ref/genes/genes.gtf.gz"
velo_bam="$bam_dir/possorted_genome_bam.bam"
velo_loom="${OUT}/velocyto.loom"
if [ ! -f "$velo_loom" ]; then
viash run src/velocity/velocyto/config.vsh.yaml -- \
--input "$velo_bam" \
--output "$velo_loom" \
--transcriptome "$velo_gtf"
fi
# combine raw counts with velocyto data
dataset_h5mu="${OUT}/dataset.h5mu"
if [ ! -f "$dataset_h5mu" ]; then
viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
--input_loom "$velo_loom" \
--input_h5mu "$raw_h5mu" \
--output "$dataset_h5mu"
fi
# run htseq
htseq_counts="${OUT}/htseq_counts.tsv"
if [ ! -f "$htseq_counts" ]; then
viash run src/mapping/htseq_count/config.vsh.yaml -- \
--input "$velo_bam" \
--reference "$velo_gtf" \
--output "$htseq_counts"
fi
multi_star="${OUT}/multi_star"
if [ ! -d "$multi_star" ]; then
viash run src/mapping/multi_star/config.vsh.yaml -- \
--input_id "tinygex" \
--input_r1 "$cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz" \
--input_r2 "$cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz" \
--input_id "tinygex" \
--input_r1 "$cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz" \
--input_r2 "$cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz" \
--reference_index "$recent_ref_dir" \
--reference_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz" \
--output "$multi_star" \
---cpus 30
fi

View File

@@ -0,0 +1,72 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
# The output folder
OUT="resources_test/concat_test_data/"
# create it if it doesn't exist already
[ -d "$OUT" ] || mkdir -p "$OUT"
echo "> Downloading files"
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/e18_mouse_brain_fresh_5k/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5 \
--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5"
target/docker/download/download_file/download_file \
--input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/human_brain_3k/human_brain_3k_filtered_feature_bc_matrix.h5 \
--output "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5"
echo "> Converting to h5mu"
viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu"
viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
--input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5" \
--output "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu"
echo "> Subsetting datasets"
viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
--input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
--output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
--number_of_observations 2000
viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
--number_of_observations 2000
echo "Making observation ids unique (required for concat component to function)"
viash run src/metadata/add_id/config.vsh.yaml -- \
--input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
--output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
--input_id "human" \
--make_observation_keys_unique
viash run src/metadata/add_id/config.vsh.yaml -- \
--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
--input_id "mouse" \
--make_observation_keys_unique
echo "Removing temp files"
rm "${OUT}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \
"$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \
"$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \
"$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \
"${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5mu" \
"$OUT/human_brain_3k_filtered_feature_bc_matrix.h5"
echo "> Running concat component"
viash run src/dataflow/concat/config.vsh.yaml -- \
--input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu,$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \
--input_id "human,mouse" \
--output "$OUT/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"

View File

@@ -0,0 +1,42 @@
#!/bin/bash
set -eo pipefail
# settings
ID=demuxafy_test_data
OUT=resources_test/$ID
DIR="$OUT"
mkdir -p "$OUT"
cd "$OUT"
# download demuxafy test dataset
wget https://www.dropbox.com/s/m8u61jn4i1mcktp/TestData4PipelineSmall.tar.gz
tar -xf TestData4PipelineSmall.tar.gz
# bam and vcf file
cp TestData4PipelineSmall/test_dataset.vcf .
# extract chr from vcf file
grep -w '^#\|^#CHROM\|^[1-2]' test_dataset.vcf > test_dataset_chr1_2.vcf
grep -w '^#\|^#CHROM\|^[3-4]' test_dataset.vcf > test_dataset_chr3_4.vcf
# barcode list
cp TestData4PipelineSmall/test_dataset/outs/filtered_gene_bc_matrices/Homo_sapiens_GRCh38p10/barcodes.tsv .
# subsetted bam and bai for souporcell
wget https://www.dropbox.com/s/7ew5lt0msf4z5gj/chr_1_pooled.sorted.bam
wget https://www.dropbox.com/s/tpplbj9sab9b2p4/chr_1_pooled.sorted.bam.bai
# variants from mixed sample
wget https://www.dropbox.com/s/btir7ge4kzc7tu1/mixed_variant.vcf
# dsc_pileup output
wget https://www.dropbox.com/s/17hj9i0yavtezx1/dsc_pileup.zip
unzip dsc_pileup.zip
# subsetted human genome reference
wget https://www.dropbox.com/s/ynlce3g7nwxthwg/genome_chr1.fa
# remove unnecessary files
rm -rf TestData4PipelineSmall
rm TestData4PipelineSmall.tar.gz
rm dsc_pileup.zip

View File

@@ -0,0 +1,58 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
ID=HLCA_reference_model
OUT=resources_test/$ID/$ID
DIR=$(dirname "$OUT")
# ideally, this would be a versioned pipeline run
[ -d "$DIR" ] || mkdir -p "$DIR"
# download and unarchive pre-trained scANVI model
wget https://zenodo.org/record/6337966/files/HLCA_reference_model.zip \
-O "${OUT}.zip"
# # Test query data
# # Source publication: Delorey, Toni M., et al. “COVID-19 tissue atlases reveal SARS-CoV-2 pathology and cellular targets.” Nature 595.7865 (2021): 107-113.
# wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5230nnn/GSM5230027/suppl/GSM5230027_04-P103142-S149-R01_raw_feature_bc_matrix.h5.gz \
# -O "${OUT}_query_test.h5.gz"
# gzip -d "${OUT}_query_test.h5.gz"
# # Prepare test data as in scvi-tools tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/query_hlca_knn.html
# python <<HEREDOC
# import pandas as pd
# import scanpy as sc
# geo_metadata_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171668/suppl/GSE171668_lung_metadata.csv.gz"
# metadata = pd.read_csv(geo_metadata_url, index_col=0)
# DATA_PATH = "${OUT}_query_test.h5"
# query_data = sc.read_10x_h5(DATA_PATH)
# # clean up .var.index (gene names)
# query_data.var['gene_names'] = query_data.var.index
# query_data.var.index = [idx.split("___")[-1] for idx in query_data.var.gene_ids]
# # clean up cell barcodes:
# query_data.obs.index = query_data.obs.index.str.rstrip("-1")
# # read in metadata (to select only cells of interest and remove empty drops)
# # subset to cells from our sample
# metadata = metadata.loc[metadata.donor == "D12_4",:].copy()
# # clean up barcodes:
# metadata.index = [idx.split("-")[-1] for idx in metadata.index]
# # subset adata to cells in metadata:
# query_data = query_data[metadata.index,:].copy()
# # add dataset information:
# query_data.obs['dataset'] = "test_dataset_delorey_regev"
# sc.write(DATA_PATH, query_data)
# HEREDOC
# # convert 10x h5 to h5mu
# viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml -- \
# --input "${OUT}_query_test.h5" \
# --output "${OUT}_query_test.h5mu"

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# settings
ID=merge_test_data
OUT=resources_test/$ID
DIR="$OUT"
mkdir -p "$OUT"
target/docker/dataflow/split_modalities/split_modalities \
--input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu \
--output "$OUT"

View File

@@ -0,0 +1,124 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
ID=pbmc_1k_protein_v3
OUT=resources_test/$ID/$ID
DIR=$(dirname "$OUT")
# ideally, this would be a versioned pipeline run
[ -d "$DIR" ] || mkdir -p "$DIR"
# dataset page:
# https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-gene-expression-and-cell-surface-protein-3-standard-3-0-0
# download metrics summary
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv \
-O "${OUT}_metrics_summary.csv"
# download counts h5 file
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 \
-O "${OUT}_filtered_feature_bc_matrix.h5"
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \
-O "${OUT}_raw_feature_bc_matrix.h5"
# download counts matrix tar gz file
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.tar.gz \
-O "${OUT}_filtered_feature_bc_matrix.tar.gz"
# extract matrix tar gz
mkdir -p "${OUT}_filtered_feature_bc_matrix"
tar -xvf "${OUT}_filtered_feature_bc_matrix.tar.gz" \
-C "${OUT}_filtered_feature_bc_matrix" \
--strip-components 1
rm "${OUT}_filtered_feature_bc_matrix.tar.gz"
# convert 10x h5 to h5mu
target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu \
--input "${OUT}_filtered_feature_bc_matrix.h5" \
--input_metrics_summary "${OUT}_metrics_summary.csv" \
--output "${OUT}_filtered_feature_bc_matrix.h5mu"
# run single sample
nextflow \
run . \
-main-script target/nextflow/workflows/rna/rna_singlesample/main.nf \
-c src/workflows/utils/labels_ci.config \
-profile docker \
--id pbmc_1k_protein_v3_uss \
--input "${OUT}_filtered_feature_bc_matrix.h5mu" \
--output "`basename $OUT`_uss.h5mu" \
--publishDir `dirname $OUT` \
-resume
# add the sample ID to the mudata object
nextflow \
run . \
-main-script target/nextflow/metadata/add_id/main.nf \
-c src/workflows/utils/labels_ci.config \
-profile docker \
--id pbmc_1k_protein_v3_uss \
--input "${OUT}_uss.h5mu" \
--input_id "pbmc_1k_protein_v3_uss" \
--output "`basename $OUT`_uss_with_id.h5mu" \
--output_compression "gzip" \
--publishDir `dirname $OUT` \
-resume
# run multisample
nextflow \
run . \
-main-script target/nextflow/workflows/rna/rna_multisample/main.nf \
-c src/workflows/utils/labels_ci.config \
-profile docker \
--id pbmc_1k_protein_v3_ums \
--input "${OUT}_uss_with_id.h5mu" \
--output "`basename $OUT`_ums.h5mu" \
--publishDir `dirname $OUT` \
-resume
rm "${OUT}_uss_with_id.h5mu"
# run dimred
nextflow \
run . \
-main-script target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf \
-c src/workflows/utils/labels_ci.config \
-profile docker \
--id pbmc_1k_protein_v3_mms \
--input "${OUT}_ums.h5mu" \
--output "`basename $OUT`_mms.h5mu" \
--publishDir `dirname $OUT` \
--obs_covariates sample_id \
-resume
# run integration
nextflow \
run . \
-main-script target/nextflow/workflows/integration/harmony_leiden/main.nf \
-c src/workflows/utils/labels_ci.config \
-profile docker \
--id pbmc_1k_protein_v3_mms_integration \
--input "${OUT}_mms.h5mu" \
--output "`basename $OUT`_mms.h5mu" \
--publishDir `dirname $OUT` \
--obs_covariates sample_id \
-resume
python <<HEREDOC
import mudata as mu
mudata = mu.read_h5mu("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu")
mudata.mod["rna"].write_h5ad("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad")
HEREDOC
# Convert to Seurat
viash run src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml -- \
--input "${OUT}_mms.h5mu" \
--output "${OUT}_mms.rds"

View File

@@ -0,0 +1,61 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=reference_gencodev41_chr1
OUT=resources_test/$ID
mkdir -p "$OUT"
wget "https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" -O "$OUT/ERCC92.zip"
# Download JASPAR files for reference building
# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
motifs_in="${OUT}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
if [ ! -f "$motifs_in" ]; then
curl -sS "$motifs_url" > "$motifs_in"
fi
# Change motif headers so the human-readable motif name precedes the motif
# identifier. So ">MA0004.1 Arnt" -> ">Arnt_MA0004.1".
motifs_modified="${OUT}/$(basename "$motifs_in").modified"
awk '{
if ( substr($1, 1, 1) == ">" ) {
print ">" $2 "_" substr($1,2)
} else {
print
}
}' "$motifs_in" > "$motifs_modified"
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz"
target: ["bd_rhapsody", "cellranger_arc"]
output_fasta: "reference.fa.gz"
output_gtf: "reference.gtf.gz"
non_nuclear_contigs: null
output_cellranger_arc: "reference_cellranger.tar.gz"
output_bd_rhapsody: "reference_bd_rhapsody.tar.gz"
bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2"
motifs_file: "$motifs_modified"
subset_regex: "chr1"
HERE
nextflow \
run . \
-main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
-profile docker \
-c ./src/workflows/utils/labels_ci.config \
-params-file /tmp/params.yaml \
--publish_dir $OUT \
-resume

View File

@@ -0,0 +1,51 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
mkdir -p "resources_test/remote_param_list/"
OUT=resources_test/remote_param_list/test_param_list.yaml
OUT_CSV=resources_test/remote_param_list/test_param_list.csv
OUT_JSON=resources_test/remote_param_list/test_param_list.json
cat > $OUT << HERE
- id: "mouse"
input: s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
publish_dir: "foo_remote/"
rna_min_counts: 2
prot_min_counts: 3
- id: "human"
input: s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
publish_dir: "foo_remote/"
rna_min_counts: 2
prot_min_counts: 3
HERE
cat > $OUT_CSV << EOF
"id","input","publish_dir","rna_min_counts","prot_min_counts"
"mouse","s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
"human","s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
EOF
cat > $OUT_JSON << HERE
[
{
"id": "mouse",
"input": "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
"publish_dir": "foo_remote/",
"rna_min_counts": 2,
"prot_min_counts": 3
},
{
"id": "human",
"input": "s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
"publish_dir": "foo_remote/",
"rna_min_counts": 2,
"prot_min_counts": 3
}
]
HERE

View File

@@ -0,0 +1,60 @@
#!/bin/bash
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=rna_velocity
OUT=resources_test/$ID
# create raw directory
velocyto_dir="$OUT/velocyto"
mkdir -p "$velocyto_dir"
########################################################
# Create a compatible BAM file from BD Rhapsody Output #
########################################################
bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/output_raw/Combined_sample_Bioproduct.bam"
if [[ ! -f "$bd_rhap_wta_bam" ]]; then
echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
exit 1
fi
echo "> Converting BD Rhapsody barcode tags."
viash run src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml -- \
-i "$bd_rhap_wta_bam" \
-o "$velocyto_dir/compatible_bd_input.bam" \
--bam \
-t 4
echo "> Creating barcodes file."
samtools view -@4 "$velocyto_dir/compatible_bd_input.bam" | \
grep -oP "(?<=CB:Z:)\S+" | sort | uniq | head > "$velocyto_dir/barcodes.txt"
###########################################################
# Process Tiny Fast Fastq dataset from 10X to create #
# input data for convert/from_velocyto_to_h5mu compontent #
###########################################################
mkdir "$OUT/velocyto_processed"
gtf="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz"
bam="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"
echo "> Processing 10x dataset"
viash run src/velocity/velocyto/config.vsh.yaml -- \
-i "$bam" \
-o "$OUT/velocyto_processed/cellranger_tiny.loom" \
--transcriptome "$gtf"
echo "> Converting loom file to MuData object"
viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
--input_loom "$OUT/velocyto_processed/cellranger_tiny.loom" \
--input_h5mu "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu" \
--modality velocyto \
--output_compression "gzip" \
--output "$OUT/velocyto_processed/velocyto.h5mu"

View File

@@ -0,0 +1,24 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=tiledb
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/"
mkdir -p "$raw_dir"
viash run src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml -- \
--input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu \
--tiledb_dir "$OUT"/pbmc_1k_protein_v3_mms \
--rna_modality "rna" \
--rna_raw_layer_input "X" \
--rna_normalized_layer_input "log_normalized" \
--rna_var_gene_names_input "gene_symbol"

View File

@@ -0,0 +1,14 @@
#!/bin/bash
set -eo pipefail
# settings
ID=vireo_test_data
OUT=resources_test/$ID
DIR="$OUT"
mkdir -p "$OUT"
cd "$OUT"
# download vireo tutorial dataset
wget https://github.com/single-cell-genetics/vireo/raw/master/data/cells.cellSNP.vcf.gz

43
ruff.toml Normal file
View File

@@ -0,0 +1,43 @@
# Exclude a variety of commonly ignored directories.
exclude = [
".git",
".pyenv",
".pytest_cache",
".ruff_cache",
".venv",
".vscode",
"__pypackages__",
"_build",
"build",
"dist",
"node_modules",
"site-packages",
]
builtins = ["meta"]
[format]
# Like Black, use double quotes for strings.
quote-style = "double"
# Like Black, indent with spaces, rather than tabs.
indent-style = "space"
# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false
# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
[lint.flake8-pytest-style]
fixture-parentheses = false
mark-parentheses = false
[lint]
ignore = [
# module level import not at top of file
"E402"
]

View File

@@ -0,0 +1,165 @@
name: celltypist
namespace: annotate
scope: "public"
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
default: log_normalized
description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
default: "cell_ontology_class"
- name: "--reference_var_input"
type: string
default: "filter_with_hvg"
required: false
description: |
.var column containing highly variable genes. If not provided, genes will not be subset.
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: false
- name: "--majority_voting"
type: boolean
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
default: false
- name: "--C"
type: double
description: "Inverse of regularization strength in logistic regression."
default: 1.0
- name: "--max_iter"
type: integer
description: "Maximum number of iterations before reaching the minimum of the cost function."
default: 1000
- name: "--use_SGD"
type: boolean_true
description: "Whether to use the stochastic gradient descent algorithm."
- name: "--min_prop"
type: double
description: |
"For the dominant cell type within a subcluster, the minimum proportion of cells required to
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'."
default: 0
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: celltypist_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: celltypist_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
- path: /src/utils/is_lognormalized.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: nvcr.io/nvidia/pytorch:25.11-py3
setup:
- type: python
packages:
- celltypist==1.7.1
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [highcpu, highmem, highdisk, gpu]

View File

@@ -0,0 +1,166 @@
import sys
import celltypist
import mudata as mu
import anndata as ad
import pandas as pd
from torch.cuda import is_available as cuda_is_available
## VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
"output": "output.h5mu",
"modality": "rna",
# "reference": None,
"reference_var_input": None,
"reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
"model": None,
# "model": "resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl",
"input_layer": None,
"reference_layer": "log_normalized",
"input_reference_gene_overlap": 100,
"reference_obs_target": "cell_ontology_class",
"feature_selection": True,
"majority_voting": True,
"output_compression": "gzip",
"input_var_gene_names": "dup_idx",
"reference_var_gene_names": "ensemblid",
"C": 1.0,
"max_iter": 1000,
"use_SGD": False,
"min_prop": 0,
"output_obs_predictions": "celltypist_pred",
"output_obs_probability": "celltypist_probability",
"sanitize_ensembl_ids": False,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from cross_check_genes import cross_check_genes
from set_var_index import set_var_index
from subset_vars import subset_vars
from is_lognormalized import is_lognormalized
logger = setup_logger()
use_gpu = cuda_is_available()
logger.info("GPU enabled? %s", use_gpu)
def main(par):
if (not par["model"] and not par["reference"]) or (
par["model"] and par["reference"]
):
raise ValueError(
"Make sure to provide either 'model' or 'reference', but not both."
)
input_mudata = mu.read_h5mu(par["input"])
input_adata = input_mudata.mod[par["modality"]]
input_modality = input_adata.copy()
# Provide correct format of query data for celltypist annotation
## Sanitize gene names and set as index
input_modality = set_var_index(
input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
)
## Fetch lognormalized counts
lognorm_counts = (
input_modality.layers[par["input_layer"]].copy()
if par["input_layer"]
else input_modality.X.copy()
)
if not is_lognormalized(lognorm_counts, target_sum=10000):
raise ValueError(
"Invalid expression matrix, expect input layer to contain log1p normalized expression to 10000 counts per cell."
)
## Create AnnData object
input_modality = ad.AnnData(
X=lognorm_counts, var=pd.DataFrame(index=input_modality.var.index)
)
if par["model"]:
logger.info("Loading CellTypist model")
model = celltypist.models.Model.load(par["model"])
cross_check_genes(
input_modality.var.index,
model.features,
min_gene_overlap=par["input_reference_gene_overlap"],
)
elif par["reference"]:
reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]
# Check expression before subsetting to HVG
if not is_lognormalized(lognorm_counts, target_sum=10000):
raise ValueError(
"Invalid expression matrix, expect reference layer to contain log1p normalized expression to 10000 counts per cell."
)
# subset to HVG if required
if par["reference_var_input"]:
reference_modality = subset_vars(
reference_modality, par["reference_var_input"]
)
# Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
# CellTypist requires query gene names to be in index
reference_modality = set_var_index(
reference_modality,
par["reference_var_gene_names"],
par["sanitize_ensembl_ids"],
)
# Ensure enough overlap between genes in query and reference
cross_check_genes(
input_modality.var.index,
reference_modality.var.index,
min_gene_overlap=par["input_reference_gene_overlap"],
)
reference_matrix = (
reference_modality.layers[par["reference_layer"]]
if par["reference_layer"]
else reference_modality.X
)
labels = reference_modality.obs[par["reference_obs_target"]]
logger.info("Training CellTypist model on reference")
model = celltypist.train(
reference_matrix,
labels=labels,
genes=reference_modality.var.index.astype(
str
), # ensure string type, fails if categorical when making var indices unique
C=par["C"],
max_iter=par["max_iter"],
use_SGD=par["use_SGD"],
feature_selection=par["feature_selection"],
check_expression=False, # if True, throws an error if lognormalized data are subset for HVG,
use_GPU=use_gpu,
)
logger.info("Predicting CellTypist annotations")
# Make sure .var index is string dtype, fails if categorical when making var indices unique
input_modality.var.index = input_modality.var.index.astype(str)
predictions = celltypist.annotate(
input_modality, model, majority_voting=par["majority_voting"], use_GPU=use_gpu
)
input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[
"predicted_labels"
].values
input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(
axis=1
).values
# copy observations back to input data (with full set of features)
input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main(par)

View File

@@ -0,0 +1,327 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
import scanpy as sc
from openpipeline_testutils.asserters import assert_annotation_objects_equal
## VIASH START
meta = {
"executable": "./target/docker/annotate/celltypist/celltypist",
"resources_dir": "resources_test/",
"cpus": 15,
"memory_gb": 20,
"config": "src/annotate/celltypist/config.vsh.yaml",
}
## VIASH END
model_file = (
f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
)
input_file_1 = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
input_file_2 = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
def log_normalize(adata):
adata_norm = sc.pp.normalize_total(adata, target_sum=1e4, copy=True)
adata_lognorm = sc.pp.log1p(adata_norm, copy=True)
adata.layers["log_normalized"] = adata_lognorm.X
return adata
def calculate_hvg(adata, n_top_genes=1000):
adata_hvg = sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, copy=True)
adata.var["filter_with_hvg"] = adata_hvg.var["highly_variable"]
return adata
@pytest.fixture
def reference_mdata():
mdata = mu.read_h5mu(reference_file)
adata = mdata.mod["rna"] # already has layer "log_normalized" with 10k target sum
adata.var["filter_with_hvg"] = adata.var[
"highly_variable"
] # already has highly variable genes calculated
return mdata
@pytest.fixture
def input_mdata():
mdata = mu.read_h5mu(input_file_1)
adata = mdata.mod["rna"].copy()
adata.layers["counts"] = adata.X.copy() # store raw counts in a layer
adata_lognorm = log_normalize(adata)
mdata.mod["rna"] = adata_lognorm
return mdata
@pytest.fixture
def model_input_mdata():
mdata = mu.read_h5mu(input_file_2)
adata = mdata.mod["rna"].copy()
adata_lognorm = log_normalize(adata)
mdata.mod["rna"] = adata_lognorm
return mdata
def test_simple_execution(
run_component, random_h5mu_path, reference_mdata, input_mdata, write_mudata_to_file
):
output_file = random_h5mu_path()
input_file = write_mudata_to_file(input_mdata)
reference_file = write_mudata_to_file(reference_mdata)
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert {"celltypist_pred", "celltypist_probability"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs"
predictions = output_mudata.mod["rna"].obs["celltypist_pred"]
assert predictions.dtype == "category", (
"Calculated predictions should be category dtype"
)
assert not all(predictions.isna()), "Not all predictions should be NA"
probabilities = output_mudata.mod["rna"].obs["celltypist_probability"]
assert probabilities.dtype == "float", (
"Calculated probabilities should be float dtype"
)
assert all(0 <= value <= 1 for value in probabilities), (
".obs at celltypist_probability has values outside the range [0, 1]"
)
def test_set_params(
run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
):
input_file = write_mudata_to_file(input_mdata)
reference_file = write_mudata_to_file(reference_mdata)
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--feature_selection",
"True",
"--majority_voting",
"True",
"--C",
"0.5",
"--max_iter",
"100",
"--use_SGD",
"--min_prop",
"0.1",
"--output",
output_file,
"--output_compression",
"gzip",
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert {"celltypist_pred", "celltypist_probability"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs"
obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
".obs at celltypist_probability has values outside the range [0, 1]"
)
def test_with_model(
run_component, random_h5mu_path, write_mudata_to_file, model_input_mdata
):
output_file = random_h5mu_path()
input_file = write_mudata_to_file(model_input_mdata)
run_component(
[
"--input",
input_file,
"--model",
model_file,
"--reference_layer",
"",
"--reference_obs_targets",
"cell_type",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
output_mudata = mu.read_h5mu(output_file)
assert {"celltypist_pred", "celltypist_probability"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs"
predictions = output_mudata.mod["rna"].obs["celltypist_pred"]
assert predictions.dtype == "category", (
"Calculated predictions should be category dtype"
)
assert not all(predictions.isna()), "Not all predictions should be NA"
probabilities = output_mudata.mod["rna"].obs["celltypist_probability"]
assert probabilities.dtype == "float", (
"Calculated probabilities should be float dtype"
)
assert all(0 <= value <= 1 for value in probabilities), (
".obs at celltypist_probability has values outside the range [0, 1]"
)
def test_fail_invalid_input_expression(
run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
):
output_file = random_h5mu_path()
input_file = write_mudata_to_file(input_mdata)
reference_file = write_mudata_to_file(reference_mdata)
# fails because input data are not correctly lognormalized
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--input_layer",
"counts",
"--reference",
reference_file,
"--reference_layer",
"log_normalized",
"--reference_var_gene_names",
"ensemblid",
"--output",
output_file,
]
)
assert re.search(
r"Invalid expression matrix, expect input layer to contain log1p normalized expression to 10000 counts per cell",
err.value.stdout.decode("utf-8"),
)
def test_with_hvg(
run_component, random_h5mu_path, write_mudata_to_file, input_mdata, reference_mdata
):
output_file = random_h5mu_path()
input_file = write_mudata_to_file(input_mdata)
reference_file = write_mudata_to_file(reference_mdata)
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--reference_var_input",
"highly_variable",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert {"celltypist_pred", "celltypist_probability"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs"
obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
".obs at celltypist_probability has values outside the range [0, 1]"
)
def test_duplicate_categorical_index_entry(
run_component, random_h5mu_path, reference_mdata, input_mdata, write_mudata_to_file
):
"""Test that the component handles duplicate index entries correctly."""
output_file = random_h5mu_path()
# Create a modified input with duplicate index entries
indices = input_mdata.mod["rna"].var["gene_symbol"].to_list()
indices[0] = indices[1] # duplicate the first index entry
import pandas as pd
input_mdata.mod["rna"].var["dup_idx"] = pd.CategoricalIndex(indices)
input_file = write_mudata_to_file(input_mdata)
reference_file = write_mudata_to_file(reference_mdata)
# Component should not raise with duplicate categorical indices
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"feature_name",
"--input_var_gene_names",
"dup_idx",
"--sanitize_ensembl_ids",
"False",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,111 @@
name: consensus_vote
namespace: annotate
scope: "public"
description: |
Combines cell type predictions from multiple annotation methods into a single consensus prediction using a weighted majority vote.
For each cell, each method votes for its predicted cell type, optionally weighted by the probability score and/or a per-method weight.
The consensus prediction is the cell type with the highest total weighted vote.
Note that this method does not leverage pre-existing ontology or perform any reconciliation of cell type labels across methods, so the same cell type may be represented by different labels in different methods and will be treated as distinct cell types in the vote.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset arguments.
arguments:
- name: "--input"
type: file
description: Input h5mu file containing cell type predictions in .obs.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_obs_predictions"
type: string
description: |
One or more .obs column names containing cell type predictions (labels) from
different annotation methods.
required: true
multiple: true
example: ["scanvi_pred", "celltypist_pred"]
- name: "--input_obs_probabilities"
type: string
description: |
One or more .obs column names containing prediction probability scores,
one per method in --input_obs_predictions. When provided, each method's
vote is scaled by the probability score for that cell (in addition to
any per-method --weights). Must be the same length as --input_obs_predictions.
required: false
multiple: true
example: ["scanvi_prob", "celltypist_prob", "singler_prob"]
- name: "--tie_label"
type: string
description: |
Label to assign when two or more cell types receive equal votes.
If not provided, tied cells are assigned None (missing value).
required: false
example: "Unknown"
- name: "--weights"
type: double
description: |
Per-method weights for the consensus vote. Must be the same length as
--input_obs_predictions when provided. Weights are normalized to sum to 1
before use. If not provided, all methods are weighted equally.
required: false
multiple: true
example: [1.0, 2.0]
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
alternatives: [-o]
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: consensus_pred
required: false
description: |
In which `.obs` slot to store the consensus predicted cell type.
- name: "--output_obs_score"
type: string
default: consensus_score
required: false
description: |
In which `.obs` slot to store the consensus score, defined as the fraction
of total weight assigned to the winning cell type.
__merge__: [., /src/base/h5_compression_argument.yaml]
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/compress_h5mu.py
test_resources:
- type: python_script
path: test.py
engines:
- type: docker
image: python:3.13-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowcpu, lowmem, lowdisk]

View File

@@ -0,0 +1,123 @@
import sys
import mudata as mu
import numpy as np
import pandas as pd
## VIASH START
par = {
"input": "test_with_probabilities.h5mu",
"modality": "rna",
"input_obs_predictions": ["scanvi_pred", "celltypist_pred", "singler_pred"],
"input_obs_probabilities": ["scanvi_prob", "celltypist_prob", "singler_prob"],
"weights": None,
"tie_label": None,
"output": "consensus_test_output.h5mu",
"output_obs_predictions": "consensus_pred",
"output_obs_score": "consensus_score",
"output_compression": "gzip",
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from compress_h5mu import write_h5ad_to_h5mu_with_compression
logger = setup_logger()
def main():
prediction_cols = par["input_obs_predictions"]
prob_cols = par["input_obs_probabilities"]
weights = par["weights"]
if weights and len(weights) != len(prediction_cols):
raise ValueError(
f"--weights must have the same length as --input_obs_predictions. "
f"Got {len(weights)} weights for {len(prediction_cols)} prediction columns."
)
if prob_cols and len(prob_cols) != len(prediction_cols):
raise ValueError(
f"--input_obs_probabilities must have the same length as --input_obs_predictions. "
f"Got {len(prob_cols)} probability columns for {len(prediction_cols)} prediction columns."
)
logger.info("Reading input data.")
adata = mu.read_h5ad(par["input"], mod=par["modality"])
cols_to_check = [prediction_cols]
if prob_cols:
cols_to_check.append(prob_cols)
for cols in cols_to_check:
for col in cols:
if col not in adata.obs.columns:
raise ValueError(f"Column '{col}' not found in .obs.")
# Each method is treated equally by default, unless user specific weights are provided
n_methods = len(prediction_cols)
logger.info("Initializing weights to matrix of ones")
weights_arr = np.ones(n_methods, dtype=np.float32)
if weights:
logger.info("Applying user-provided weights.")
weights_arr = np.array(weights, dtype=np.float32)
logger.info("Normalizing weights")
weights_arr = weights_arr / weights_arr.sum()
# Apply the weights to the probabilities in the data
weights = pd.DataFrame(
[weights_arr] * adata.n_obs, index=adata.obs.index, columns=prediction_cols
)
if prob_cols:
logger.info("Scaling the weights with the probabilities from each method")
weights = weights * adata.obs[prob_cols].astype(np.float32).to_numpy()
assert pd.notna(weights).all(axis=None)
logger.info("Computing weighted majority vote.")
pred_df = adata.obs[prediction_cols].astype(str)
# For each cell and each method (index), get the label and the weight
incidences_weights = pd.DataFrame(
{"label": pred_df.stack(), "weights": weights.stack()}
)
# Move the label to the index, there might be duplicate indices now
incidences_weights = incidences_weights.set_index("label", append=True).rename_axis(
["cell_id", "method", "label"]
)
# Sum the weights per label, from this the labels with the largest weights need to be selected
summed_weights = incidences_weights.groupby(level=["cell_id", "label"]).sum()
# Find the weight that is the largest per group
max_weight_per_group = summed_weights.groupby(level="cell_id").transform("max")
# Use the value to look-up the corresponding IDs and labels
max_weights_mask = summed_weights["weights"] == max_weight_per_group["weights"]
entries_for_max_weights = summed_weights[max_weights_mask].reset_index(
level="label"
)
# Find the cases where there is a tie
is_duplicated = max_weights_mask.groupby(level="cell_id").sum() > 1
# For the ties, overwrite the label. If a cell is in the frame more than once it is because of a tie.
entries_for_max_weights.loc[is_duplicated, ["label"]] = par["tie_label"]
# Now its safe to just take the first index in case of duplicates, since the label and the score is the same.
entries_for_max_weights = entries_for_max_weights[
~entries_for_max_weights.index.duplicated()
]
# Normalize the weights
normalized_scores = (
entries_for_max_weights["weights"]
/ incidences_weights["weights"].groupby(level="cell_id").sum()
)
# Handle devision by 0
normalized_scores = normalized_scores.replace([np.inf, -np.inf], 0.0).fillna(0.0)
logger.info("Moving the output to the anndata.")
adata.obs[par["output_obs_predictions"]] = entries_for_max_weights["label"].astype(
"category"
)
adata.obs[par["output_obs_score"]] = normalized_scores
logger.info("Writing output data...")
write_h5ad_to_h5mu_with_compression(
par["output"], par["input"], par["modality"], adata, par["output_compression"]
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,303 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
import anndata as ad
import numpy as np
import pandas as pd
from openpipeline_testutils.asserters import assert_annotation_objects_equal
## VIASH START
meta = {
"resources_dir": "resources_test",
"executable": "./target/docker/annotate/consensus_vote",
"config": "./src/annotate/consensus_vote/config.vsh.yaml",
}
## VIASH END
@pytest.fixture
def input_mdata(write_mudata_to_file):
np.random.seed(42)
n_obs = 100
cell_types = ["T cell", "B cell", "NK cell", "Monocyte"]
obs = pd.DataFrame(index=[f"cell_{i}" for i in range(n_obs)])
for method in ["scanvi", "celltypist", "singler"]:
preds = np.random.choice(cell_types, size=n_obs)
obs[f"{method}_pred"] = pd.Categorical(preds, categories=cell_types)
obs[f"{method}_prob"] = np.random.uniform(0.5, 1.0, size=n_obs)
rna = ad.AnnData(
X=np.random.rand(n_obs, 10),
obs=obs,
)
prot = ad.AnnData(X=np.random.rand(n_obs, 5), obs=pd.DataFrame(index=obs.index))
mdata = mu.MuData({"rna": rna, "prot": prot})
return write_mudata_to_file(mdata)
@pytest.fixture
def tied_mdata(write_mudata_to_file):
"""Fixture where all cells have a guaranteed tie: two methods always disagree."""
n_obs = 10
obs = pd.DataFrame(index=[f"cell_{i}" for i in range(n_obs)])
obs["method_a_pred"] = pd.Categorical(["T cell"] * n_obs)
obs["method_b_pred"] = pd.Categorical(["B cell"] * n_obs)
rna = ad.AnnData(X=np.random.rand(n_obs, 5), obs=obs)
mdata = mu.MuData({"rna": rna})
return write_mudata_to_file(mdata)
def test_simple_execution(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--input_obs_predictions",
"singler_pred",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist."
input_mudata = mu.read_h5mu(input_mdata)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert {"consensus_pred", "consensus_score"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs."
predictions = output_mudata.mod["rna"].obs["consensus_pred"]
assert predictions.dtype == "category", (
"Consensus predictions should be category dtype."
)
assert not predictions.isna().all(), "Not all consensus predictions should be NA."
scores = output_mudata.mod["rna"].obs["consensus_score"]
assert all(0 <= v <= 1 for v in scores), (
"Consensus scores should be in the range [0, 1]."
)
def test_with_weights(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--input_obs_predictions",
"singler_pred",
"--weights",
"1.0",
"--weights",
"1.0",
"--weights",
"2.0",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist."
output_mudata = mu.read_h5mu(output_file)
assert {"consensus_pred", "consensus_score"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs."
def test_custom_output_obs_names(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--output_obs_predictions",
"my_consensus_pred",
"--output_obs_score",
"my_consensus_score",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist."
output_mudata = mu.read_h5mu(output_file)
assert {"my_consensus_pred", "my_consensus_score"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Custom output keys not found in .obs."
def test_tie_returns_none_by_default(run_component, random_h5mu_path, tied_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
tied_mdata,
"--input_obs_predictions",
"method_a_pred",
"--input_obs_predictions",
"method_b_pred",
"--output",
output_file,
]
)
output_mudata = mu.read_h5mu(output_file)
predictions = output_mudata.mod["rna"].obs["consensus_pred"]
assert predictions.isna().all(), "All tied predictions should be None."
def test_tie_label(run_component, random_h5mu_path, tied_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
tied_mdata,
"--input_obs_predictions",
"method_a_pred",
"--input_obs_predictions",
"method_b_pred",
"--tie_label",
"Unknown",
"--output",
output_file,
]
)
output_mudata = mu.read_h5mu(output_file)
predictions = output_mudata.mod["rna"].obs["consensus_pred"]
assert (predictions == "Unknown").all(), "All tied predictions should be 'Unknown'."
def test_with_probabilities(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--input_obs_predictions",
"singler_pred",
"--input_obs_probabilities",
"scanvi_prob",
"--input_obs_probabilities",
"celltypist_prob",
"--input_obs_probabilities",
"singler_prob",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist."
output_mudata = mu.read_h5mu(output_file)
assert {"consensus_pred", "consensus_score"}.issubset(
output_mudata.mod["rna"].obs.keys()
), "Required keys not found in .obs."
assert not output_mudata.mod["rna"].obs["consensus_pred"].isna().all(), (
"Not all probability-weighted consensus predictions should be NA."
)
def test_mismatched_probabilities_error(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--input_obs_probabilities",
"scanvi_prob",
"--output",
output_file,
]
)
assert re.search(
r"ValueError: --input_obs_probabilities must have the same length as --input_obs_predictions",
err.value.stdout.decode("utf-8"),
)
def test_mismatched_weights_error(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"scanvi_pred",
"--input_obs_predictions",
"celltypist_pred",
"--weights",
"1.0",
"--output",
output_file,
]
)
assert re.search(
r"ValueError: --weights must have the same length as --input_obs_predictions",
err.value.stdout.decode("utf-8"),
)
def test_missing_prediction_column_error(run_component, random_h5mu_path, input_mdata):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_mdata,
"--input_obs_predictions",
"nonexistent_pred",
"--output",
output_file,
]
)
assert re.search(
r"ValueError: Column 'nonexistent_pred' not found in .obs",
err.value.stdout.decode("utf-8"),
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,173 @@
name: onclass
namespace: annotate
scope: "public"
description: |
OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity.
These similarities enable OnClass to annotate cell types that are never seen in the training data.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
required: false
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Ontology
description: Ontology input files
arguments:
- name: "--cl_nlp_emb_file"
type: file
description: The .nlp.emb file with the cell type embeddings.
required: true
example: cl.ontology.nlp.emb
- name: "--cl_ontology_file"
type: file
description: The .ontology file with the cell type ontology.
required: true
example: cl.ontology
- name: "--cl_obo_file"
type: file
description: The .obo file with the cell type ontology.
required: true
example: cl.obo
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
example: "cell_ontology_class"
required: true
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--reference_var_input"
type: string
required: false
description: |
.var column containing highly variable genes. By default, do not subset genes.
- name: "--unknown_celltype"
type: string
default: "Unknown"
required: false
description: |
Label for unknown cell types.
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: onclass_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: onclass_prob
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]
- name: Model arguments
description: Model arguments
arguments:
- name: "--model"
type: string
description: |
"Pretrained model path without a file extension. If not provided, the model will be trained
on the reference data and --reference should be provided. The path namespace should contain:
- a .npz or .pkl file
- a .data file
- a .meta file
- a .index file
e.g. /path/to/model/pretrained_model_target1 as saved by OnClass."
required: false
direction: input
- name: "--max_iter"
type: integer
default: 30
required: false
description: Maximum number of iterations for training the model.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.11
setup:
- type: python
packages:
- OnClass~=1.3
- tensorflow
- obonet
__merge__: [/src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [highcpu, highmem, highdisk]

View File

@@ -0,0 +1,218 @@
import sys
import mudata as mu
import numpy as np
from OnClass.OnClassModel import OnClassModel
import obonet
from typing import Dict, List, Tuple
## VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
"output": "output.h5mu",
"modality": "rna",
"reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
"model": None,
# "reference": None,
# "model": "resources_test/annotation_test_data/onclass_model/example_file_model",
"reference_obs_target": "cell_ontology_class",
"input_layer": None,
"reference_layer": None,
"max_iter": 100,
"output_obs_predictions": None,
"output_obs_probability": None,
"cl_nlp_emb_file": "resources_test/annotation_test_data/ontology/cl.ontology.nlp.emb",
"cl_ontology_file": "resources_test/annotation_test_data/ontology/cl.ontology",
"cl_obo_file": "resources_test/annotation_test_data/ontology/cl.obo",
"output_compression": "gzip",
"input_var_gene_names": "gene_symbol",
"input_reference_gene_overlap": 100,
"reference_var_input": None,
"reference_var_gene_names": None,
"unkown_celltype": "Unknown",
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from cross_check_genes import cross_check_genes
from set_var_index import set_var_index
from subset_vars import subset_vars
logger = setup_logger()
def map_celltype_to_ontology_id(
cl_obo_file: str,
) -> Tuple[Dict[str, str], Dict[str, str]]:
"""
Map cell type names to ontology IDs and vice versa.
Parameters
----------
cl_obo_file : str
Path to the cell ontology file.
Returns
-------
Tuple[Dict[str, str], Dict[str, str]]
A tuple of two dictionaries. The first dictionary maps cell ontology IDs to cell type names.
The second dictionary maps cell type names to cell ontology IDs.
"""
graph = obonet.read_obo(cl_obo_file)
cl_id_to_name = {id_: data.get("name") for id_, data in graph.nodes(data=True)}
cl_id_to_name = {k: v for k, v in cl_id_to_name.items() if v is not None}
name_to_cl_id = {v: k for k, v in cl_id_to_name.items()}
return cl_id_to_name, name_to_cl_id
def cell_type_prediction(
model: OnClassModel,
input_matrix: np.array,
input_features: List[str],
id_to_name: dict,
) -> Tuple[List[str], List[float]]:
"""
Predict cell types for input data and save results to Anndata obj.
Parameters
----------
model : OnClassModel
The OnClass model.
input_matrix : np.array
The input data matrix.
input_modality : ad.AnnData
The input data Anndata object.
id_to_name : dict
Dictionary mapping cell ontology IDs to cell type names.
Returns
-------
predictions: List[str]
The predicted cell types.
probabilities: List[float]
Probabilities of the predicted cell types.
"""
corr_test_feature = model.ProcessTestFeature(
test_feature=input_matrix,
test_genes=input_features,
log_transform=False,
)
onclass_pred = model.Predict(
corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
)
pred_label = [model.i2co[ind] for ind in onclass_pred[2]]
pred_cell_type_label = [id_to_name[id] for id in pred_label]
prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
return pred_cell_type_label, prob_cell_type_label
def main():
if (not par["model"] and not par["reference"]) or (
par["model"] and par["reference"]
):
raise ValueError(
"Make sure to provide either 'model' or 'reference', but not both."
)
logger.info("Reading input data")
input_mudata = mu.read_h5mu(par["input"])
input_adata = input_mudata.mod[par["modality"]]
input_modality = input_adata.copy()
# Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
input_modality = set_var_index(
input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
)
input_matrix = (
input_modality.layers[par["input_layer"]]
if par["input_layer"]
else input_modality.X
)
# Onclass needs dense matrix format
input_matrix = input_matrix.toarray()
id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"])
if par["model"]:
logger.info("Predicting cell types using pre-trained model")
model = OnClassModel(
cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
cell_type_network_file=par["cl_ontology_file"],
)
model.BuildModel(use_pretrain=par["model"], ngene=None)
cross_check_genes(
model.genes, input_modality.var.index, par["input_reference_gene_overlap"]
)
elif par["reference"]:
logger.info("Reading reference data")
model = OnClassModel(
cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
cell_type_network_file=par["cl_ontology_file"],
)
reference_mudata = mu.read_h5mu(par["reference"])
reference_modality = reference_mudata.mod[par["modality"]].copy()
reference_modality = set_var_index(
reference_modality,
par["reference_var_gene_names"],
par["sanitize_ensembl_ids"],
)
# subset to HVG if required
if par["reference_var_input"]:
reference_modality = subset_vars(
reference_modality, par["reference_var_input"]
)
cross_check_genes(
input_modality.var.index,
reference_modality.var.index,
par["input_reference_gene_overlap"],
)
reference_matrix = (
reference_modality.layers[par["reference_layer"]]
if par["reference_layer"]
else reference_modality.X
)
# Onclass needs dense matrix format
reference_matrix = reference_matrix.toarray()
logger.info("Training a model from reference...")
labels = reference_modality.obs[par["reference_obs_target"]].tolist()
labels_cl = [
name_to_id[label] if label in name_to_id else par["unknown_celltype"]
for label in labels
]
_ = model.EmbedCellTypes(labels_cl)
corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature(
train_feature=reference_matrix,
train_label=labels_cl,
train_genes=reference_modality.var.index,
test_feature=input_matrix,
test_genes=input_modality.var.index,
log_transform=False,
)
model.BuildModel(ngene=len(corr_train_genes))
model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"])
logger.info("Predicting cell types")
predictions, probabilities = cell_type_prediction(
model, input_matrix, input_modality.var.index, id_to_name
)
logger.info("Writing output data")
input_adata.obs[par["output_obs_predictions"]] = predictions
input_adata.obs[par["output_obs_probability"]] = probabilities
input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,181 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
from openpipeline_testutils.asserters import assert_annotation_objects_equal
## VIASH START
meta = {"resources_dir": "resources_test"}
## VIASH END
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
cl_nlp_emb_file = (
f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
)
cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology"
cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo"
model_file = (
f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
)
def test_simple_execution(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--cl_nlp_emb_file",
cl_nlp_emb_file,
"--cl_ontology_file",
cl_ontology_file,
"--cl_obo_file",
cl_obo_file,
"--max_iter",
"10",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
assert all(0 <= value <= 1 for value in obs_values), (
".obs at cell_ontology_class_prob has values outside the range [0, 1]"
)
def test_custom_obs(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--output_obs_predictions",
"dummy_pred_1",
"--output_obs_probability",
"dummy_prob_1",
"--cl_nlp_emb_file",
cl_nlp_emb_file,
"--cl_ontology_file",
cl_ontology_file,
"--cl_obo_file",
cl_obo_file,
"--max_iter",
"10",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert set(output_mudata.mod["rna"].obs.keys()) == {"dummy_pred_1", "dummy_prob_1"}
obs_values = output_mudata.mod["rna"].obs["dummy_prob_1"]
assert all(0 <= value <= 1 for value in obs_values), (
".obs at dummy_prob_1 has values outside the range [0, 1]"
)
def test_no_model_no_reference_error(run_component, random_h5mu_path):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--output",
output_file,
"--cl_nlp_emb_file",
cl_nlp_emb_file,
"--cl_ontology_file",
cl_ontology_file,
"--cl_obo_file",
cl_obo_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
]
)
assert re.search(
r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
err.value.stdout.decode("utf-8"),
)
def test_pretrained_model(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--input_var_gene_names",
"gene_symbol",
"--sanitize_ensembl_ids",
"False",
"--cl_nlp_emb_file",
cl_nlp_emb_file,
"--cl_ontology_file",
cl_ontology_file,
"--cl_obo_file",
cl_obo_file,
"--reference_obs_target",
"cell_ontology_class",
"--model",
model_file,
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
assert all(0 <= value <= 1 for value in obs_values), (
".obs at cell_ontology_class_prob has values outside the range [0, 1]"
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,161 @@
name: popv
namespace: "annotate"
scope: "public"
description: "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV."
authors:
- __merge__: /src/authors/matthias_beyens.yaml
roles: [ author ]
- __merge__: /src/authors/robrecht_cannoodt.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Arguments related to the input (aka query) dataset.
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: Input h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.
required: false
- name: "--input_obs_batch"
type: string
description: Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.
required: false
- name: "--input_var_subset"
type: string
description: Subset the input object with this column.
required: false
- name: "--input_obs_label"
type: string
description: Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `"unknown_celltype_label"`.
required: false
- name: "--unknown_celltype_label"
type: string
description: If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.
default: "unknown"
required: false
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "User-provided reference tissue. The data that will be used as reference to call cell types."
example: TS_Bladder_filtered.h5ad
direction: input
required: true
- name: "--reference_layer"
type: string
description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.
required: false
- name: "--reference_obs_label"
type: string
description: Key in obs field of reference AnnData with cell-type information.
default: "cell_ontology_class"
required: false
- name: "--reference_obs_batch"
type: string
description: Key in obs field of input adata for batch information.
default: "donor_assay"
required: false
# - name: "--reference_models"
# type: file
# description: Pretrained models. Can be a directory or a tar gz.
# required: false
# example: pretrained_models_Bladder_ts.tar.gz
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
required: true
example: output.h5mu
__merge__: [., /src/base/h5_compression_argument.yaml]
# - name: "--output_models"
# type: file
# direction: output
# description: If `prediction_mode == "retrain"`, saves models to a directory and compresses the results into a tar gz.
# example: "output.tar.gz"
# required: false
- name: Arguments
description: Other arguments.
arguments:
- name: "--methods"
type: string
description: "Methods to call cell types. By default, runs KNN_SCVI and SCANVI_POPV."
example: ["KNN_SCVI", "SCANVI_POPV"]
choices: [CELLTYPIST, KNN_BBKNN, KNN_HARMONY, KNN_SCANORAMA, KNN_SCVI, ONCLASS, Random_Forest, SCANVI_POPV, Support_Vector]
required: true
multiple: true
# - name: "--prediction_mode"
# type: string
# description: |
# Execution mode of cell-type annotation.
# "retrain": Train all prediction models and saves them to disk. Argument `output_models` must be defined.
# "inference": Classify all cells based on pretrained models. Argument `reference_models` must be defined.
# "fast": Fast inference using only query cells and single epoch in scArches.
# - name: "--plots"
# type: boolean
# description: "Creation of agreement and frequency plots between selected cell type algorithmn(s) and final PopV ensemble called cell type."
# default: false
# required: false
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
#image: nvcr.io/nvidia/pytorch:22.12-py3
image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
setup:
- type: docker
env:
# Build extensions without AVX512/AVX2 support
- CFLAGS="-mno-avx512f -mno-avx2"
- CPPFLAGS="-mno-avx512f -mno-avx2"
- type: apt
packages:
- procps
- git
- build-essential
- type: python
packages:
- popv~=0.6.1
# See https://github.com/YosefLab/PopV/issues/30
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
# download ontology required by popv
- type: docker
run: |
cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
# docker_run_args: ["--gpus all"]
- type: nextflow
directives:
# TODO: should add new label highmem-single-gpu and lowmem-single-gpu
label: [highmem, highcpu, highdisk]

227
src/annotate/popv/script.py Normal file
View File

@@ -0,0 +1,227 @@
import sys
import re
import tempfile
import typing
import numpy as np
import mudata as mu
import anndata as ad
import popv
# todo: is this still needed?
from torch.cuda import is_available as cuda_is_available
try:
from torch.backends.mps import is_available as mps_is_available
except ModuleNotFoundError:
# Older pytorch versions
# MacOS GPUs
def mps_is_available():
return False
# where to find the obo files
cl_obo_folder = "/opt/PopV/resources/ontology/"
## VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
# "input": "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu",
"modality": "rna",
"reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5ad",
"input_obs_batch": None,
"input_layer": None,
"input_obs_label": None,
"input_var_subset": None,
"unknown_celltype_label": "unknown",
"reference_layer": None,
"reference_obs_label": "cell_ontology_class",
"reference_obs_batch": "donor_assay",
"output": "output.h5mu",
"output_compression": "gzip",
"methods": [
# "celltypist",
# "knn_on_bbknn",
# "knn_on_scanorama",
# "knn_on_scvi",
"Random_Forest",
# "scanvi",
"svm",
],
}
meta = {"resources_dir": "src/utils", "temp_dir": "/tmp"}
# for debugging the obo folder can be somewhere local
cl_obo_folder = "popv_cl_ontology/"
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
use_gpu = cuda_is_available()
logger.info("GPU enabled? %s", use_gpu)
popv.settings.accelerator = "cuda" if use_gpu else "cpu"
# Helper functions
def get_X(
adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]
):
"""Fetch the counts data from X or a layer. Subset columns by var_index if so desired."""
if var_index:
adata = adata[:, var_index]
if layer:
return adata.layers[layer]
else:
return adata.X
def get_obs(adata: ad.AnnData, obs_par_names):
"""Subset the obs dataframe to just the columns defined by the obs_label and obs_batch."""
obs_columns = [par[x] for x in obs_par_names if par[x]]
return adata.obs[obs_columns]
def get_var(adata: ad.AnnData, var_index: list[str]):
"""Fetch the var dataframe. Subset rows by var_index if so desired."""
return adata.var.loc[var_index]
def main(par, meta):
assert len(par["methods"]) >= 1, (
"Please, specify at least one method for cell typing."
)
logger.info("Cell typing methods: {}".format(par["methods"]))
### PREPROCESSING REFERENCE ###
logger.info("### PREPROCESSING REFERENCE ###")
# take a look at reference data
logger.info("Reading reference data '%s'", par["reference"])
reference = ad.read_h5ad(par["reference"])
logger.info("Setting reference var index to Ensembl IDs")
reference.var["gene_symbol"] = list(reference.var.index)
reference.var.index = [
re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]
]
logger.info("Detect number of samples per label")
min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
n_samples_per_label = np.max((min_celltype_size, 100))
### PREPROCESSING INPUT ###
logger.info("### PREPROCESSING INPUT ###")
logger.info("Reading '%s'", par["input"])
input = mu.read_h5mu(par["input"])
input_modality = input.mod[par["modality"]]
# subset with var column
if par["input_var_subset"]:
logger.info("Subset input with .var['%s']", par["input_var_subset"])
assert par["input_var_subset"] in input_modality.var, (
f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
)
input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]]
### ALIGN REFERENCE AND INPUT ###
logger.info("### ALIGN REFERENCE AND INPUT ###")
logger.info("Detecting common vars based on ensembl ids")
common_ens_ids = list(
set(reference.var.index).intersection(set(input_modality.var.index))
)
logger.info(" reference n_vars: %i", reference.n_vars)
logger.info(" input n_vars: %i", input_modality.n_vars)
logger.info(" intersect n_vars: %i", len(common_ens_ids))
assert len(common_ens_ids) >= 100, "The intersection of genes is too small."
# subset input objects to make sure popv is using the data we expect
input_modality = ad.AnnData(
X=get_X(input_modality, par["input_layer"], common_ens_ids),
obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
var=get_var(input_modality, common_ens_ids),
)
reference = ad.AnnData(
X=get_X(reference, par["reference_layer"], common_ens_ids),
obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
var=get_var(reference, common_ens_ids),
)
# remove layers that
### ALIGN REFERENCE AND INPUT ###
logger.info("### ALIGN REFERENCE AND INPUT ###")
with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir:
logger.info("Run PopV processing")
pq = popv.preprocessing.Process_Query(
# input
query_adata=input_modality,
query_batch_key=par["input_obs_batch"],
query_layer_key=None, # this is taken care of by subset
# reference
ref_adata=reference,
ref_layer_key=None, # this is taken care of by subset
ref_labels_key=par["reference_obs_label"],
ref_batch_key=par["reference_obs_batch"],
# options
unknown_celltype_label=par["unknown_celltype_label"],
n_samples_per_label=n_samples_per_label,
# pretrained model
# Might need to be parameterized at some point
prediction_mode="retrain",
pretrained_scvi_path=None,
# outputs
# Might need to be parameterized at some point
save_path_trained_models=temp_dir,
# hardcoded values
cl_obo_folder=cl_obo_folder,
)
method_kwargs = {}
if "scanorama" in par["methods"]:
method_kwargs["scanorama"] = {"approx": False}
logger.info("Annotate data")
popv.annotation.annotate_data(
adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs
)
popv_input = pq.adata[input_modality.obs_names]
# select columns starting with "popv_"
popv_obs_cols = popv_input.obs.columns[
popv_input.obs.columns.str.startswith("popv_")
]
# create new data frame with selected columns
df_popv = popv_input.obs[popv_obs_cols]
# remove prefix from column names
df_popv.columns = df_popv.columns.str.replace("popv_", "")
# store output in mudata .obsm
input.mod[par["modality"]].obsm["popv_output"] = df_popv
# copy important output in mudata .obs
for col in ["popv_prediction"]:
if col in popv_input.obs.columns:
input.mod[par["modality"]].obs[col] = popv_input.obs[col]
# code to explore how the output differs from the original
# for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]:
# old_keys = set(getattr(pq_adata_orig, attr).keys())
# new_keys = set(getattr(pq.adata, attr).keys())
# diff_keys = list(new_keys.difference(old_keys))
# diff_keys.sort()
# print(f"{attr}:", flush=True)
# for key in diff_keys:
# print(f" {key}", flush=True)
# write output
logger.info("Writing %s", par["output"])
input.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main(par, meta)

95
src/annotate/popv/test.py Normal file
View File

@@ -0,0 +1,95 @@
import sys
import os
import pytest
import mudata as mu
## VIASH START
meta = {"resources_dir": "resources_test"}
## VIASH END
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
def test_simple_execution(run_component):
output_file = "output.h5mu"
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--output",
"output.h5mu",
"--methods",
"Random_Forest;Support_Vector",
]
)
# check whether file exists
assert os.path.exists(output_file), "Output file does not exist"
# read output mudata
output = mu.read_h5mu(output_file)
# check output
expected_rna_obs_cols = ["popv_prediction"]
for col in expected_rna_obs_cols:
assert col in output.mod["rna"].obs.columns, (
f"could not find columns .mod['rna'].obs['{col}']"
)
print(f"output: {output}", flush=True)
def test_popv_with_other_layer(run_component, tmp_path):
input_h5mu = mu.read(input_file)
input_h5mu.mod["rna"].layers["test"] = input_h5mu.mod["rna"].X.copy()
input_h5mu.write_h5mu(tmp_path / "input.h5mu")
run_component(
[
"--input",
tmp_path / "input.h5mu",
"--reference",
reference_file,
"--output",
"output.h5mu",
"--methods",
"Random_Forest;Support_Vector;KNN_SCANORAMA;KNN_SCVI",
]
)
def test_popv_with_non_overlapping_cells(run_component, tmp_path):
input_h5mu = mu.read(input_file)
# copy previous modalities
rna_ad = input_h5mu.mod["rna"].copy()
prot_ad = input_h5mu.mod["prot"].copy()
# change obs_names such that the cells do not overlap
rna_ad.obs_names = [f"rna_{x}" for x in rna_ad.obs_names]
prot_ad.obs_names = [f"prot_{x}" for x in prot_ad.obs_names]
# write new h5mu to file
new_h5mu = mu.MuData({"rna": rna_ad, "prot": prot_ad})
new_h5mu.write_h5mu(tmp_path / "input.h5mu")
# run component
run_component(
[
"--input",
tmp_path / "input.h5mu",
"--reference",
reference_file,
"--output",
"output.h5mu",
"--methods",
"Random_Forest;Support_Vector;KNN_SCANORAMA",
]
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,170 @@
name: random_forest_annotation
namespace: annotate
scope: "public"
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
description: Key in obs field of reference modality with cell-type information.
required: true
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--reference_var_input"
type: string
required: false
description: |
.var column containing highly variable genes. By default, do not subset genes.
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: random_forest_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: random_forest_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--n_estimators"
type: integer
required: false
default: 100
description: Number of trees in the random forest.
- name: "--max_depth"
type: integer
required: false
description: |
Maximum depth of the trees in the random forest.
If not provided, the nodes are expanded until all leaves only contain a single sample.
- name: "--criterion"
type: string
required: false
choices: ["gini", "entropy", "log_loss"]
default: "gini"
description: The function to measure the quality of a split.
- name: "--class_weight"
type: string
required: false
default: "balanced_subsample"
choices: ["balanced", "balanced_subsample", "uniform"]
description: |
Weights associated with classes.
The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.
The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.
The `uniform` mode gives all classes a weight of one.
- name: "--max_features"
type: string
default: "200"
description: |
The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.
If integer: consider max_features features at each split.
If `sqrt`: max_features is the squareroot of all input features.
If `log2`: max_features is the log2 of all input features.
If `all`: max features equals all input features.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- type: python
packages:
- scikit-learn==1.4.2
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [highcpu, highmem, highdisk]

View File

@@ -0,0 +1,162 @@
import sys
import mudata as mu
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle
## VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
"output": "output.h5mu",
"input_var_gene_names": None,
"input_reference_gene_overlap": 100,
"modality": "rna",
"reference": None,
# "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
# "model": None,
"model": "resources_test/annotation_test_data/rf_model.pkl",
"reference_obs_target": "cell_ontology_class",
"reference_var_gene_names": "ensemblid",
"reference_var_input": None,
"input_layer": None,
"reference_layer": None,
"n_estimators": 100,
"criterion": "gini",
"max_depth": None,
"class_weight": None,
"max_features": 200,
"output_compression": "gzip",
"output_obs_predictions": "random_forest_pred",
"output_obs_probability": "random_forest_probability",
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from cross_check_genes import cross_check_genes
from subset_vars import subset_vars
from set_var_index import set_var_index
logger = setup_logger()
def main():
logger.info("Reading input data")
input_mudata = mu.read_h5mu(par["input"])
input_adata = input_mudata.mod[par["modality"]]
input_modality = input_adata.copy()
input_modality = set_var_index(
input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
)
# Handle max_features parameter
max_features_conversion = {
"all": None,
"sqrt": "sqrt",
"log2": "log2",
}
try:
max_features = max_features_conversion.get(
par["max_features"], int(par["max_features"])
)
except ValueError:
raise ValueError(
f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'"
)
if (not par["model"] and not par["reference"]) or (
par["model"] and par["reference"]
):
raise ValueError(
"Make sure to provide either 'model' or 'reference', but not both."
)
if par["model"]:
logger.info("Loading a pre-trained model")
model = pickle.load(open(par["model"], "rb"))
if hasattr(model, "_feature_names_in"):
common_genes = cross_check_genes(
input_modality.var.index,
model._feature_names_in,
par["input_reference_gene_overlap"],
)
if not len(common_genes) == len(model._feature_names_in):
raise ValueError("Input dataset does not contain all model features.")
input_modality = input_modality[:, common_genes]
input_matrix = (
input_modality.layers[par["input_layer"]]
if par["input_layer"]
else input_modality.X
)
else:
logger.warning(
"Model does not have feature names saved. Could not check overlap of model's features with query genes."
)
elif par["reference"]:
logger.info("Reading reference data")
reference_mudata = mu.read_h5mu(par["reference"])
reference_modality = reference_mudata.mod[par["modality"]].copy()
reference_modality = set_var_index(
reference_modality,
par["reference_var_gene_names"],
par["sanitize_ensembl_ids"],
)
# subset to HVG if required
if par["reference_var_input"]:
reference_modality = subset_vars(
reference_modality, par["reference_var_input"]
)
# Query and input require the exact same features
common_genes = cross_check_genes(
input_modality.var.index,
reference_modality.var.index,
par["input_reference_gene_overlap"],
)
reference_modality = reference_modality[:, common_genes]
input_modality = input_modality[:, common_genes]
reference_matrix = (
reference_modality.layers[par["reference_layer"]]
if par["reference_layer"]
else reference_modality.X
)
input_matrix = (
input_modality.layers[par["input_layer"]]
if par["input_layer"]
else input_modality.X
)
logger.info("Training a model...")
labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
model = RandomForestClassifier(
n_estimators=par["n_estimators"],
criterion=par["criterion"],
max_depth=par["max_depth"],
class_weight=par["class_weight"]
if not par["class_weight"] == "uniform"
else None,
max_features=max_features,
)
model.fit(reference_matrix, labels)
model._feature_names_in = reference_modality.var.index
logger.info("Running predictions...")
predictions = model.predict(input_matrix)
probabilities = np.max(model.predict_proba(input_matrix), axis=1)
input_adata.obs[par["output_obs_predictions"]] = predictions
input_adata.obs[par["output_obs_probability"]] = probabilities
logger.info("Writing output data")
input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,240 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
from openpipeline_testutils.asserters import assert_annotation_objects_equal
from sklearn.ensemble import RandomForestClassifier
import pickle
## VIASH START
meta = {"resources_dir": "resources_test"}
sys.path.append("src/utils")
## VIASH END
input_file = (
f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
)
reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
sys.path.append(meta["resources_dir"])
from cross_check_genes import cross_check_genes
from set_var_index import set_var_index
@pytest.fixture
def dummy_model(tmp_path):
reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy()
reference_modality = set_var_index(reference_modality, "ensemblid")
input_modality = mu.read_h5mu(input_file).mod["rna"].copy()
input_modality = set_var_index(input_modality, None)
common_genes = cross_check_genes(
input_modality.var.index, reference_modality.var.index
)
reference_modality = reference_modality[:, common_genes]
labels = reference_modality.obs["cell_ontology_class"].to_numpy()
model = RandomForestClassifier()
model.fit(reference_modality.X, labels)
model._feature_names_in = reference_modality.var.index
model_path = tmp_path / "model.pkl"
with open(model_path, "wb") as f:
pickle.dump(model, f)
return model_path
def test_simple_execution(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"random_forest_pred",
"random_forest_probability",
]
obs_values = output_mudata.mod["rna"].obs["random_forest_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_custom_out_obs_model_params(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--output_obs_predictions",
"dummy_pred",
"--output_obs_probability",
"dummy_probability",
"--n_estimators",
"10",
"--criterion",
"entropy",
"--max_depth",
"5",
"--class_weight",
"balanced",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"dummy_pred",
"dummy_probability",
]
obs_values = output_mudata.mod["rna"].obs["dummy_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_with_model(run_component, random_h5mu_path, dummy_model):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--model",
dummy_model,
"--output",
output_file,
"--reference_obs_target",
"cell_ontology_class",
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"random_forest_pred",
"random_forest_probability",
]
obs_values = output_mudata.mod["rna"].obs["random_forest_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_no_model_no_reference_error(run_component, random_h5mu_path):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--output",
output_file,
"--reference_obs_target",
"cell_ontology_class--reference_var_gene_names",
"ensemblid",
]
)
assert re.search(
r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
err.value.stdout.decode("utf-8"),
)
def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--output",
output_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--model",
dummy_model,
]
)
assert re.search(
r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
err.value.stdout.decode("utf-8"),
)
def test_invalid_max_features(run_component, random_h5mu_path):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--output",
output_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--max_features",
"invalid_value",
]
)
assert re.search(
r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'",
err.value.stdout.decode("utf-8"),
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,169 @@
name: scanvi
namespace: "annotate"
scope: "public"
description: |
scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.
This component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input h5mu file. Note that this needs to be the exact same dataset as the --scvi_model was trained on.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: |
Which modality from the input MuData file to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
required: false
description: "Input layer to use. If None, X is used"
- name: "--var_input"
type: string
required: false
description: ".var column containing highly variable genes that were used to train the scVi model. By default, do not subset genes."
- name: "--var_gene_names"
type: string
required: false
description: ".var column containing gene names. By default, use the index."
- name: "--obs_labels"
type: string
required: true
description: ".obs field containing the labels"
- name: "--unlabeled_category"
type: string
default: "Unknown"
description: |
Value in the --obs_labels field that indicates unlabeled observations
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: scVI Model
arguments:
- name: "--scvi_model"
type: file
description: "Pretrained SCVI reference model to initialize the SCANVI model with."
example: scvi_model.pt
direction: input
required: true
- name: Outputs
arguments:
- name: "--output"
alternatives: ["-o"]
type: file
example: output.h5mu
description: Output h5mu file.
direction: output
required: true
- name: "--output_model"
type: file
description: Folder where the state of the trained model will be saved to.
required: false
direction: output
example: path/to/output_model/
- name: "--obsm_output"
type: string
default: "X_scanvi_integrated"
required: false
description: "In which .obsm slot to store the resulting integrated embedding."
- name: "--obs_output_predictions"
type: string
default: scanvi_pred
description: "In which .obs slot to store the predicted labels."
- name: "--obs_output_probabilities"
type: string
default: scanvi_proba
description: "In which. obs slot to store the probabilities of the predicted labels."
__merge__: [., /src/base/h5_compression_argument.yaml]
- name: "scANVI training arguments"
arguments:
- name: "--early_stopping"
required: false
type: boolean
description: "Whether to perform early stopping with respect to the validation set."
- name: "--early_stopping_monitor"
choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
default: "elbo_validation"
type: string
description: "Metric logged during validation set epoch."
- name: "--early_stopping_patience"
type: integer
min: 1
default: 45
description: "Number of validation epochs with no improvement after which training will be stopped."
- name: "--early_stopping_min_delta"
min: 0
type: double
default: 0.0
description: "Minimum change in the monitored quantity to qualify as an improvement,
i.e. an absolute change of less than min_delta, will count as no improvement."
- name: "--max_epochs"
type: integer
description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest."
required: false
- name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
type: boolean
default: True
- name: "--lr_factor"
description: "Factor to reduce learning rate."
type: double
default: 0.6
min: 0
- name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will be reduced."
type: double
default: 30
min: 0
resources:
- type: python_script
path: script.py
- path: /src/utils/subset_vars.py
- path: /src/utils/compress_h5mu.py
- path: /src/utils/set_var_index.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/scvi_model/
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
engines:
- type: docker
image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
setup:
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
- type: python
packages:
- scvi-tools[cuda]~=1.4.1
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
# docker_run_args: ["--gpus all"]
- type: nextflow
directives:
label: [midcpu, midmem, gpu, highdisk]

View File

@@ -0,0 +1,112 @@
import mudata
import scvi
import numpy as np
### VIASH START
par = {
"input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
"modality": "rna",
"input_layer": None,
"var_input": None,
"output": "foo.h5mu",
"obsm_output": "X_scvi_integrated",
"early_stopping": True,
"early_stopping_monitor": "elbo_validation",
"early_stopping_patience": 45,
"early_stopping_min_delta": 0,
"reduce_lr_on_plateau": True,
"lr_factor": 0.6,
"lr_patience": 30,
"max_epochs": 500,
"n_obs_min_count": 10,
"n_var_min_count": 10,
"output_model": "test/",
"output_compression": "gzip",
}
meta = {"resources_dir": "src/utils"}
### VIASH END
import sys
sys.path.append(meta["resources_dir"])
from subset_vars import subset_vars
from set_var_index import set_var_index
from compress_h5mu import write_h5ad_to_h5mu_with_compression
from setup_logger import setup_logger
logger = setup_logger()
def main():
logger.info("Reading input data...")
adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"])
if par["var_input"]:
# Subset to HVG
adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy()
else:
adata_subset = adata.copy()
# Sanitize gene names and set as index of the AnnData object
adata_subset = set_var_index(
adata_subset, par["var_gene_names"], par["sanitize_ensembl_ids"]
)
logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}")
scvi_model = scvi.model.SCVI.load(
par["scvi_model"],
adata_subset,
accelerator="auto",
device="auto",
)
logger.info("Instantiating scANVI model from scVI model...")
vae_uns = scvi.model.SCANVI.from_scvi_model(
scvi_model,
unlabeled_category=par["unlabeled_category"],
labels_key=par["obs_labels"],
adata=adata_subset,
)
plan_kwargs = {
"reduce_lr_on_plateau": par["reduce_lr_on_plateau"],
"lr_patience": par["lr_patience"],
"lr_factor": par["lr_factor"],
}
logger.info("Training scANVI model...")
# Train the model
vae_uns.train(
max_epochs=par["max_epochs"],
early_stopping=par["early_stopping"],
early_stopping_monitor=par["early_stopping_monitor"],
early_stopping_patience=par["early_stopping_patience"],
early_stopping_min_delta=par["early_stopping_min_delta"],
plan_kwargs=plan_kwargs,
check_val_every_n_epoch=1,
accelerator="auto",
)
# Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set
logger.info("Performing scANVI integration...")
# Get the latent output
adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation()
logger.info("Performing scANVI prediction...")
adata.obs[par["obs_output_predictions"]] = vae_uns.predict()
adata.obs[par["obs_output_probabilities"]] = np.max(
vae_uns.predict(soft=True), axis=1
)
logger.info("Writing output data...")
write_h5ad_to_h5mu_with_compression(
par["output"], par["input"], par["modality"], adata, par["output_compression"]
)
if par["output_model"]:
vae_uns.save(par["output_model"], overwrite=True)
if __name__ == "__main__":
main()

127
src/annotate/scanvi/test.py Normal file
View File

@@ -0,0 +1,127 @@
import pytest
import re
import subprocess
from pathlib import Path
import mudata
from anndata.tests.helpers import assert_equal
## VIASH START
meta = {
"executable": "./target/executable/integrate/scanvi/scanvi",
"resources_dir": "./resources_test/pbmc_1k_protein_v3/",
}
## VIASH END
import sys
sys.path.append(meta["resources_dir"])
input_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
input_file_2 = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu"
scvi_model = f"{meta['resources_dir']}/scvi_model"
def test_scanvi(run_component):
args = [
"--input",
input_file,
"--modality",
"rna",
"--obs_labels",
"cell_ontology_class",
"--scvi_model",
scvi_model,
"--output",
"output.h5mu",
"--output_model",
"scanvi_model",
"--max_epochs",
"5",
"--output_compression",
"gzip",
]
run_component(args)
input_rna = mudata.read_h5ad(input_file.strip(), mod="rna")
# check files
assert Path("output.h5mu").is_file(), "Output file does not exist"
assert Path("scanvi_model").is_dir()
assert Path("scanvi_model/model.pt").is_file()
# check output h5mu
output_data = mudata.read_h5mu("output.h5mu")
output_rna = output_data.mod["rna"]
assert output_rna.n_obs == input_rna.n_obs, (
f"Number of observations changed\noutput_data: {output_data}"
)
assert output_rna.n_vars == input_rna.n_vars, (
f"Number of variables changed\noutput_data: {output_data}"
)
assert "X_scanvi_integrated" in output_rna.obsm, (
f".obsm['X_scanvi_integrated'] not added\noutput_data: {output_data}"
)
assert "scanvi_pred" in output_rna.obs, (
f".obs['scanvi_pred'] not added\noutput_data: {output_data}"
)
assert "scanvi_proba" in output_rna.obs, (
f".obs['scanvi_proba'] not added\noutput_data: {output_data}"
)
predictions = output_data.mod["rna"].obs["scanvi_pred"]
probabilities = output_data.mod["rna"].obs["scanvi_proba"]
assert predictions.dtype == "category", (
"Calculated predictions should be category dtype"
)
assert not all(predictions.isna()), "Not all predictions should be NA"
assert probabilities.dtype == "float32", (
"Calculated probabilities should be float32 dtype"
)
assert all(0 <= value <= 1 for value in probabilities), (
".obs at celltypist_probability has values outside the range [0, 1]"
)
# assert that nothing else has changed
del output_rna.obsm["X_scanvi_integrated"]
del output_rna.obs["scanvi_pred"]
del output_rna.obs["scanvi_proba"]
assert_equal(input_rna, output_rna)
def test_raises_with_noncompatible_input_file(run_component):
args = [
"--input",
input_file_2,
"--modality",
"rna",
"--obs_labels",
"cell_ontology_class",
"--scvi_model",
scvi_model,
"--output",
"output.h5mu",
"--output_model",
"scanvi_model",
"--max_epochs",
"5",
"--output_compression",
"gzip",
]
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(args)
assert re.search(
r"ValueError: Number of vars in `adata_target` not the same as source.",
err.value.stdout.decode("utf-8"),
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,199 @@
name: singler
namespace: annotate
scope: "public"
description: |
SingleR performs reference-based cell type annotation for single-cell RNA-seq data
by computing Spearman correlations between test cells and reference samples with known labels,
using marker genes to assign the most similar cell type label to each new cell.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
default: "log_normalized"
description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata .var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_obs_clusters"
type: string
required: false
description: |
The name of the adata .obs column containing cluster identities of the observations.
If provided, annoation is performed on the aggregated cluster profiles,
otherwise it defaults to annotation per observation.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: true
- name: "--reference_layer"
type: string
description: The layer in the reference data containing lognormalized couns to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
required: true
description: The name of the adata obs column in the reference data containing cell type annotations.
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--reference_var_input"
type: string
default: "filter_with_hvg"
required: false
description: |
.var column containing a boolean mask corresponding to genes to be used for marker selection. If not provided, genes will not be subset.
- name: Arguments
description: Arguments related to the training of and classification with the SingleR model
arguments:
- name: "--de_n_genes"
type: integer
required: False
min: 1
description: |
The number of differentially expressed genes across labels to be calculated from the reference.
Defaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels when if `--de_method` is set to `classic`,
otherwise, defaults to 10.
- name: "--de_method"
type: string
default: "classic"
choices: ["classic", "t", "wilcox"]
description: Method to detect differentially expressed genes between pairs of labels.
- name: "--quantile"
type: double
min: 0
max: 1
default: 0.8
description: The quantile of the correlation distribution to use to compute the score per label.
- name: "--fine_tune"
type: boolean
default: true
description: |
Whether finetuning should be performed to improve the resolution.
If set to True, an additional finetuning step is performed after initial classification,
new marker genes are calculated based on all cells with a score higher then the maximum score minus `--fine_tuning_thershold`,
and the calculation of the scores is repeated.
- name: "--fine_tuning_threshold"
type: double
default: 0.05
description: |
The maximum difference from the maximum correlation to use in fine-tuning
- name: "--prune"
type: boolean
default: true
description:
Whether label pruning should be performed.
If set to True, an additional output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,
containing labels where 'low-quality' labels are replaced with NA's.
Labels are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`) fall more than 3 median absolute deviations below the median for that label type.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: singler_pred
description: |
In which `.obs` slot to store the predicted labels. If `--fine_tune False`, this is based only on the maximum entry in `--output_obsm_scores`.
- name: "--output_obs_probability"
type: string
default: singler_probability
description: |
In which `.obs` slots to store the probability of the predicted labels.
- name: "--output_obs_delta_next"
type: string
default: singler_delta_next
description: |
In which `.obs` slot to store the delta between the best and next-best score. If `--fine_tune True`, this is reported for scores after fine-tuning.
- name: "--output_obs_pruned_predictions"
type: string
default: singler_pruned_labels
description: |
In which `.obs` slot to store the pruned labels, where low-quality labels are replaced with NA's. Only added if `--prune True`.
- name: "--output_obsm_scores"
type: string
default: singler_scores
description: |
In which `.obsm` slot to store the matrix of prediction correlations at the specified quantile for each label (column) in each cell (row).
__merge__: [., /src/base/h5_compression_argument.yaml]
resources:
- type: r_script
path: script.R
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
engines:
- type: docker
image: rocker/r2u:24.04
setup:
- type: docker
env:
- RETICULATE_PYTHON=/usr/bin/python
- PIP_BREAK_SYSTEM_PACKAGES=1
- type: apt
packages: [ libhdf5-dev, python3, python3-pip, python3-dev, python-is-python3 ]
- type: r
cran: [ anndata, reticulate, SingleR ]
bioc: [ scrapper, bit64 ]
- type: python
user: true
__merge__: [ /src/base/requirements/anndata_mudata.yaml ]
test_setup:
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, lowcpu]

View File

@@ -0,0 +1,202 @@
library(SingleR)
library(Matrix)
requireNamespace("anndata", quietly = TRUE)
requireNamespace("reticulate", quietly = TRUE)
mudata <- reticulate::import("mudata")
### VIASH START
par <- list(
input = "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
modality = "rna",
input_layer = NULL,
input_var_gene_names = "gene_symbol",
input_reference_gene_overlap = 100,
input_obs_clusters = NULL,
reference = "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
reference_layer = NULL,
reference_var_input = NULL,
reference_var_gene_names = NULL,
reference_obs_target = "cell_ontology_class",
output = "singler_output.h5mu",
output_compression = "gzip",
output_obs_predictions = "singler_labels",
output_obs_probability = "singler_proba",
output_obsm_scores = "single_r_scores",
output_obs_delta_next = "singler_delta_next",
output_obs_pruned_predictions = "singler_pruned_labels",
quantile = 0.8,
fine_tune = TRUE,
fine_tuning_threshold = 0.05,
prune = TRUE,
de_n_genes = NULL,
de_method = "classic"
)
meta <- list(
cpus = 4
)
### VIASH END
get_layer <- function(adata, layer, var_gene_names) {
# find data
data <-
if (is.null(layer)) {
adata$X
} else {
adata$layers[[layer]]
}
# check if data is available
if (is.null(data)) {
if (is.null(layer)) {
stop("No layer specified and no .X slot available in the AnnData object.")
} else {
stop(
"Requested layer '",
layer,
"' is not available in the AnnData object. Available layers: ",
paste(names(adata$layers), collapse = ", ")
)
}
}
# Set matrix dimnames
input_gene_names <- sanitize_ensembl_ids(adata, var_gene_names)
dimnames(data) <- list(adata$obs_names, input_gene_names)
# return output
data
}
sanitize_ensembl_ids <- function(adata, gene_symbol = NULL) {
if (is.null(gene_symbol)) {
gene_names <- adata$var_names
} else {
gene_names <- adata$var[[gene_symbol]]
}
# Pattern matches Ensembl IDs: starts with ENS, followed by any characters,
# then an eleven digit number, optionally followed by .version_number
ensembl_pattern <- "^(ENS.*\\d{11})(?:\\.\\d+)?$"
# Remove version numbers for ensembl ids only
sanitized <- ifelse(
grepl(ensembl_pattern, gene_names, perl = TRUE),
gsub(ensembl_pattern, "\\1", gene_names, perl = TRUE),
as.character(gene_names)
)
sanitized
}
subset_vars <- function(adata, subset_col) {
# Check if column exists
if (!subset_col %in% colnames(adata$var)) {
stop(
"Requested to use .var column '",
subset_col,
"' as a selection of genes, but the column is not available."
)
}
# Get the column
subset_values <- adata$var[[subset_col]]
# Check for NA values
if (any(is.na(subset_values))) {
stop(
"The .var column `",
subset_col,
"` contains NA values. Can not subset data."
)
}
# Check if it's logical/boolean
if (!is.logical(subset_values)) {
stop(
"Expected dtype of .var column '",
subset_col,
"' to be `logical`, but found ",
class(subset_values)[1],
". Can not subset data."
)
}
# Subset and return copy
adata_subset <- adata[, subset_values]$copy()
adata_subset
}
# Read input data
cat("Reading input file\n")
input_mdata <- mudata$read_h5mu(par$input)
input_adata <- input_mdata$mod[[par$modality]]
input_matrix <- Matrix::t(
get_layer(input_adata, par$input_layer, par$input_var_gene_names)
)
# Read reference
cat("Reading reference file\n")
ref_adata <- mudata$read_h5ad(par$reference, mod = "rna")
if (!is.null(par$reference_var_input)) {
ref_adata <- subset_vars(ref_adata, par$reference_var_input)
}
ref_matrix <- Matrix::t(
get_layer(ref_adata, par$reference_layer, par$reference_var_gene_names)
)
# Check overlap genes
cat("Checking overlap of genes between input and reference file\n")
common_ens_ids <- intersect(rownames(ref_matrix), rownames(input_matrix))
if (length(common_ens_ids) < par$input_reference_gene_overlap) {
stop(
"The intersection of genes between the query and reference is too small.\n",
paste0("Expected overlap: ", par$input_reference_gene_overlap),
paste0("Detected overlap: ", length(common_ens_ids))
)
}
# Calculate CPU cores
n_workers <- meta$cpus %||% max(1, parallel::detectCores() - 1)
# Assign target labels
if (!par$reference_obs_target %in% colnames(ref_adata$obs)) {
stop(
"Requested to use .obs column '",
par$reference_obs_target,
"' as target labels, but the column is not available."
)
} else {
target_labels <- ref_adata$obs[[par$reference_obs_target]]
}
cat("Performing SingleR cell type prediction\n")
predictions <- SingleR(
test = input_matrix,
ref = ref_matrix,
labels = target_labels,
clusters = par$input_obs_clusters,
genes = "de",
de.method = par$de_method,
de.n = par$de_n_genes,
quantile = par$quantile,
fine.tune = par$fine_tune,
tune.thresh = par$fine_tuning_threshold,
prune = par$prune,
num.threads = n_workers
)
cat("Writing output data\n")
# Writing output slots
input_adata$obs[[par$output_obs_predictions]] <-
predictions$labels
input_adata$obs[[par$output_obs_probability]] <-
apply(predictions$scores, 1, max)
input_adata$obs[[par$output_obs_delta_next]] <-
predictions$delta.next
input_adata$obs[[par$output_obs_pruned_predictions]] <-
predictions$pruned.labels
input_adata$obsm[[par$output_obsm_scores]] <-
predictions$scores
# Writing output H5MU
input_mdata$write(par$output, compression = par$output_compression)

View File

@@ -0,0 +1,143 @@
import sys
import os
import scanpy as sc
import pytest
import mudata as mu
from openpipeline_testutils.asserters import assert_annotation_objects_equal
## VIASH START
meta = {"resources_dir": "resources_test"}
## VIASH END
input_file = (
f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
)
reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
def log_normalize(adata):
adata_lognorm = adata.copy()
sc.pp.normalize_total(adata_lognorm, target_sum=1e4)
sc.pp.log1p(adata_lognorm)
adata.layers["log_normalized"] = adata_lognorm.X
return adata
@pytest.fixture
def input_path(write_mudata_to_file):
mdata = mu.read_h5mu(input_file)
adata = mdata.mod["rna"].copy()
adata_lognorm = log_normalize(adata)
mdata.mod["rna"] = adata_lognorm
return write_mudata_to_file(mdata)
def test_simple_execution(run_component, random_h5mu_path, input_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_path,
"--input_var_gene_names",
"gene_symbol",
"--reference",
reference_file,
"--reference_var_input",
"highly_variable",
"--reference_obs_target",
"cell_ontology_class",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"singler_pred",
"singler_probability",
"singler_delta_next",
"singler_pruned_labels",
]
assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"]
predictions = output_mudata.mod["rna"].obs["singler_pred"]
assert predictions.dtype == "category", (
"Calculated predictions should be category dtype"
)
assert predictions.notna().all(), "None of the predictions should be NA"
pruned_predictions = output_mudata.mod["rna"].obs["singler_pruned_labels"]
assert pruned_predictions.dtype == "category", (
"Pruned predictions should be category dtype"
)
assert pruned_predictions.isna().any(), (
"Some of the pruned predictions should be NA"
)
assert not all(pruned_predictions.isna()), "Not all pruned predictions should be NA"
delta_next = output_mudata.mod["rna"].obs["singler_delta_next"]
assert delta_next.dtype == "float", "Calculated delta next should be float dtype"
scores = output_mudata.mod["rna"].obsm["singler_scores"]
assert scores.dtype == "float", "Calculated scores should be float dtype"
assert all(-1 <= value <= 1 for value in scores.flatten()), (
".obsm at singler_scores has values outside the range [-1, 1]"
)
def test_params(run_component, random_h5mu_path, input_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_path,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--input_reference_gene_overlap",
"1000",
"--reference_var_gene_names",
"ensemblid",
"--reference_var_input",
"highly_variable",
"de_n_genes",
"600",
"--de_method",
"wilcox",
"--quantile",
"0.75",
"--fine_tuning_threshold",
"0.1",
"--prune",
"False",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"singler_pred",
"singler_probability",
"singler_delta_next",
]
assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"]
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,159 @@
name: svm_annotation
namespace: annotate
scope: "public"
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
description:
required: true
description: |
Key in .obs attribute of reference modality with cell-type information.
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--reference_var_input"
type: string
required: false
description: |
.var column containing highly variable genes. By default, do not subset genes.
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_prediction"
type: string
default: svm_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: svm_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: true
- name: "--max_iter"
type: integer
description: "Maximum number of iterations for the SVM."
min: 1
default: 5000
- name: "--c_reg"
type: double
description: "Regularization parameter for the SVM."
min: 0.0
default: 1.0
- name: "--class_weight"
type: string
description: |
"Class weights for the SVM. The `uniform` mode gives all classes a weight of one.
The `balanced` mode (default) uses the values of y to automatically adjust weights inversely
proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))"
choices: ["balanced", "uniform"]
default: "balanced"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- type: python
packages:
- scikit-learn==1.5.2
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [highcpu, highmem, highdisk]

View File

@@ -0,0 +1,144 @@
import sys
import mudata as mu
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn import svm
import pickle
## VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
"output": "output.h5mu",
"modality": "rna",
"reference": None,
# "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
# "model": None,
"model": "resources_test/annotation_test_data/svm_model.pkl",
"reference_obs_target": "cell_ontology_class",
"reference_var_input": None,
"input_layer": None,
"reference_layer": None,
"max_iter": 5000,
"c_reg": 1,
"class_weight": "balanced",
"output_compression": "gzip",
"input_var_gene_names": None,
"reference_var_gene_names": "ensemblid",
"output_obs_prediction": "svm_pred",
"output_obs_probability": "svm_probability",
"input_reference_gene_overlap": 100,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from cross_check_genes import cross_check_genes
from subset_vars import subset_vars
from set_var_index import set_var_index
logger = setup_logger()
def main():
if (not par["model"] and not par["reference"]) or (
par["model"] and par["reference"]
):
raise ValueError(
"Make sure to provide either 'model' or 'reference', but not both."
)
logger.info("Reading input data")
input_mudata = mu.read_h5mu(par["input"])
input_adata = input_mudata.mod[par["modality"]]
input_modality = input_adata.copy()
input_modality = set_var_index(
input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
)
if par["model"]:
logger.info("Loading a pre-trained model")
model = pickle.load(open(par["model"], "rb"))
if hasattr(model, "_feature_names_in"):
common_genes = cross_check_genes(
input_modality.var.index,
model._feature_names_in,
par["input_reference_gene_overlap"],
)
if not len(common_genes) == len(model._feature_names_in):
raise ValueError("Input dataset does not contain all model features.")
input_modality = input_modality[:, common_genes]
input_matrix = (
input_modality.layers[par["input_layer"]]
if par["input_layer"]
else input_modality.X
)
else:
logger.warning(
"Model does not have feature names saved. Could not check overlap of model's features with query genes."
)
elif par["reference"]:
logger.info("Reading reference data")
reference_mudata = mu.read_h5mu(par["reference"])
reference_modality = reference_mudata.mod[par["modality"]].copy()
reference_modality = set_var_index(
reference_modality,
par["reference_var_gene_names"],
par["sanitize_ensembl_ids"],
)
# subset to HVG if required
if par["reference_var_input"]:
reference_modality = subset_vars(
reference_modality, par["reference_var_input"]
)
# Query and input require the exact same features
common_genes = cross_check_genes(
input_modality.var.index,
reference_modality.var.index,
par["input_reference_gene_overlap"],
)
reference_modality = reference_modality[:, common_genes]
input_modality = input_modality[:, common_genes]
reference_matrix = (
reference_modality.layers[par["reference_layer"]]
if par["reference_layer"]
else reference_modality.X
)
input_matrix = (
input_modality.layers[par["input_layer"]]
if par["input_layer"]
else input_modality.X
)
logger.info("Training a model...")
labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
model = CalibratedClassifierCV(
svm.LinearSVC(
C=par["c_reg"],
max_iter=par["max_iter"],
class_weight=par["class_weight"]
if not par["class_weight"] == "uniform"
else None,
dual="auto",
)
)
model.fit(reference_matrix, labels)
model._feature_names_in = reference_modality.var.index
logger.info("Running predictions...")
predictions = model.predict(input_matrix)
probabilities = np.max(model.predict_proba(input_matrix), axis=1)
logger.info("Writing output data")
input_adata.obs[par["output_obs_prediction"]] = predictions
input_adata.obs[par["output_obs_probability"]] = probabilities
input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,184 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
from openpipeline_testutils.asserters import assert_annotation_objects_equal
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
import pickle
## VIASH START
meta = {"resources_dir": "resources_test"}
sys.path.append("src/utils")
## VIASH END
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
sys.path.append(meta["resources_dir"])
from cross_check_genes import cross_check_genes
from set_var_index import set_var_index
@pytest.fixture
def dummy_model(tmp_path):
reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy()
reference_modality = set_var_index(reference_modality, "ensemblid")
input_modality = mu.read_h5mu(input_file).mod["rna"].copy()
input_modality = set_var_index(input_modality, None)
common_genes = cross_check_genes(
input_modality.var.index, reference_modality.var.index
)
reference_modality = reference_modality[:, common_genes]
labels = reference_modality.obs["cell_ontology_class"].to_numpy()
model = CalibratedClassifierCV(
svm.LinearSVC(
max_iter=10,
dual="auto",
)
)
model.fit(reference_modality.X, labels)
model._feature_names_in = reference_modality.var.index
model_path = tmp_path / "model.pkl"
with open(model_path, "wb") as f:
pickle.dump(model, f)
return model_path
def test_simple_execution(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_obs_target",
"cell_ontology_class",
"--reference_var_gene_names",
"ensemblid",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"]
obs_values = output_mudata.mod["rna"].obs["svm_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_custom_out_obs_model_params(run_component, random_h5mu_path):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference",
reference_file,
"--reference_var_gene_names",
"ensemblid",
"--reference_obs_target",
"cell_ontology_class",
"--output_obs_prediction",
"dummy_pred",
"--output_obs_probability",
"dummy_probability",
"--max_iter",
"1000",
"--c_reg",
"0.1",
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == [
"dummy_pred",
"dummy_probability",
]
obs_values = output_mudata.mod["rna"].obs["dummy_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_with_model(run_component, random_h5mu_path, dummy_model):
output_file = random_h5mu_path()
run_component(
[
"--input",
input_file,
"--reference_obs_target",
"cell_ontology_class",
"--model",
dummy_model,
"--output",
output_file,
]
)
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file)
output_mudata = mu.read_h5mu(output_file)
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"]
obs_values = output_mudata.mod["rna"].obs["svm_probability"]
assert all(0 <= value <= 1 for value in obs_values), (
"probabilities outside the range [0, 1]"
)
def test_no_model_no_reference_error(run_component, random_h5mu_path):
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_file,
"--reference_obs_target",
"cell_ontology_class",
"--output",
output_file,
]
)
assert re.search(
r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
err.value.stdout.decode("utf-8"),
)
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,14 @@
name: Angela Oliveira Pisco
info:
role: Contributor
links:
github: aopisco
orcid: "0000-0003-0142-2355"
linkedin: aopisco
organizations:
- name: Insitro
href: https://insitro.com
role: Director of Computational Biology
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,11 @@
name: Dorien Roosen
info:
role: Core Team Member
links:
email: dorien@data-intuitive.com
github: dorien-er
linkedin: dorien-roosen
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,11 @@
name: Dries De Maeyer
info:
role: Core Team Member
links:
email: ddemaeyer@gmail.com
github: ddemaeyer
linkedin: dries-de-maeyer-b46a814
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Principal Scientist

View File

@@ -0,0 +1,12 @@
name: Dries Schaumont
info:
role: Core Team Member
links:
email: dries@data-intuitive.com
github: DriesSchaumont
orcid: "0000-0002-4389-0440"
linkedin: dries-schaumont
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,6 @@
name: Elizabeth Mlynarski
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Principal Scientist Computational Genomics

View File

@@ -0,0 +1,10 @@
name: Isabelle Bergiers
info:
role: Contributor
links:
github: Isabelle-b
orcid: 0000-0001-9622-7960
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Scientist OMICS Technology

View File

@@ -0,0 +1,11 @@
name: Jakub Majercik
info:
role: Contributor
links:
email: jakub@data-intuitive.com
github: jakubmajercik
linkedin: jakubmajercik
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatics Engineer

View File

@@ -0,0 +1,15 @@
name: Kai Waldrant
info:
role: Contributor
links:
email: kai@data-intuitive.com
github: KaiWaldrant
orcid: "0009-0003-8555-1361"
linkedin: kaiwaldrant
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatician
- name: Open Problems
href: https://openproblems.bio
role: Contributor

View File

@@ -0,0 +1,12 @@
name: Luke Zappia
info:
role: Contributor
links:
email: luke@data-intuitive.com
github: lazappi
orcid: 0000-0001-7744-8565
linkedin: lazappi
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer

View File

@@ -0,0 +1,16 @@
name: Malte D. Luecken
info:
role: Core Team Member
links:
email: malte.luecken@helmholtz-muenchen.de
github: LuckyMD
orcid: "0000-0001-7464-7921"
linkedin: malte-l%C3%BCcken-b8b21049
twitter: MDLuecken
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Group Leader
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,11 @@
name: Marijke Van Moerbeke
info:
role: Contributor
links:
github: mvanmoerbeke
orcid: 0000-0002-3097-5621
linkedin: marijke-van-moerbeke-84303a34
organizations:
- name: OpenAnalytics
href: https://www.openanalytics.eu
role: Statistical Consultant

View File

@@ -0,0 +1,12 @@
name: Matthias Beyens
info:
role: Contributor
links:
github: MatthiasBeyens
orcid: "0000-0003-3304-0706"
email: matthias.beyens@gmail.com
linkedin: mbeyens
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Principal Scientist

View File

@@ -0,0 +1,11 @@
name: Mauro Saporita
info:
role: Contributor
links:
email: maurosaporita@gmail.com
github: mauro-saporita
linkedin: mauro-saporita-930b06a5
organizations:
- name: Ardigen
href: https://ardigen.com
role: Lead Nextflow Developer

View File

@@ -0,0 +1,11 @@
name: Povilas Gibas
info:
role: Contributor
links:
email: povilasgibas@gmail.com
github: PoGibas
linkedin: povilas-gibas
organizations:
- name: Ardigen
href: https://ardigen.com
role: Bioinformatician

View File

@@ -0,0 +1,15 @@
name: Robrecht Cannoodt
info:
role: Core Team Member
links:
email: robrecht@data-intuitive.com
github: rcannood
orcid: "0000-0003-3641-729X"
linkedin: robrechtcannoodt
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,10 @@
name: Samuel D'Souza
info:
role: Contributor
links:
github: srdsam
linkedin: samuel-d-souza-887023150/
organizations:
- name: Chan Zuckerberg Biohub
href: https://www.czbiohub.org
role: Data Engineer

View File

@@ -0,0 +1,10 @@
name: Sarah Ouologuem
info:
role: Contributor
links:
github: SarahOuologuem
orcid: 0009-0005-3398-1700
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Student Assistant

View File

@@ -0,0 +1,10 @@
name: Toni Verbeiren
info:
role: Core Team Member
links:
github: tverbeiren
linkedin: verbeiren
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist and CEO

View File

@@ -0,0 +1,12 @@
name: Vladimir Shitov
info:
role: Contributor
links:
email: vladimir.shitov@helmholtz-muenchen.de
github: vladimirshitov
orcid: "0000-0002-1960-8812"
linkedin: vladimir-shitov-9a659513b
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: PhD Candidate

View File

@@ -0,0 +1,6 @@
name: Weiwei Schultz
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Associate Director Data Sciences

View File

@@ -0,0 +1,11 @@
name: Xichen Wu
info:
role: Contributor
links:
github: wxicu
linkedin: xichen-wu
orcid: 0009-0008-2168-4508
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Student Assistant

View File

@@ -0,0 +1,9 @@
arguments:
- name: "--output_compression"
description: |
Compression format to use for the output AnnData and/or Mudata H5 files.
By default no compression is applied.
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"

View File

@@ -0,0 +1,4 @@
packages:
- anndata~=0.12.16
- awkward #Required for reading VDJ data stored in AIRR format
- scipy~=1.17.1 # Exclude scipy 1.17.0 because https://github.com/scverse/anndata/issues/339

View File

@@ -0,0 +1,5 @@
__merge__: [/src/base/requirements/anndata.yaml, .]
packages:
- mudata~=0.3.8
script: |
exec("try:\n import zarr; from importlib.metadata import version\nexcept ModuleNotFoundError:\n exit(0)\nelse: assert int(version(\"zarr\").partition(\".\")[0]) > 2")

View File

@@ -0,0 +1,2 @@
github:
- openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils

View File

@@ -0,0 +1,8 @@
test_setup:
- type: apt
packages:
- git
- type: python
__merge__:
- /src/base/requirements/viashpy.yaml
- /src/base/requirements/openpipeline_testutils.yaml

View File

@@ -0,0 +1,2 @@
packages:
- scanpy~=1.11.4

View File

@@ -0,0 +1,10 @@
setup:
- type: apt
packages:
- procps
- git
- type: python
__merge__:
- /src/base/requirements/anndata_mudata.yaml
- /src/base/requirements/openpipeline_testutils.yaml
- /src/base/requirements/viashpy.yaml

Some files were not shown because too many files have changed in this diff Show More