Build branch fix-integration-tests with version dev (2dbe3b72)

Build pipeline: vsh-ci-dev-k8tz4

Source commit: 2dbe3b7231

Source message: Fix pointers to test resources
This commit is contained in:
CI
2024-10-17 17:56:12 +00:00
commit cd0af18851
2125 changed files with 1018836 additions and 0 deletions

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
src/mapping/bd_rhapsody*/*.cwl linguist-generated
src/query/cellxgene_census linguist-generated

40
.gitignore vendored Normal file
View File

@@ -0,0 +1,40 @@
# Jupyter notebooks
.ipynb_checkpoints
# pycache
*__pycache__*
.nfs*
# R related
.Rhistory
*.Rproj
.Rproj.user
# Python virtual environments
.venv
# temporary files related
temp
# NextFlow
work/
.nextflow.log
flowchart.*
.nextflow*
out/
# Macos
.DS_Store
# viash related
.viash_log*
log.txt
check_results/
out/
output*
output_log/
resources_test
/viash_tools/
# vscode
.vscode/launch.json

626
.pylintrc Normal file
View File

@@ -0,0 +1,626 @@
[MAIN]
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Load and enable all available extensions. Use --list-extensions to see a list
# of all available extensions.
#enable-all-extensions=
# In error mode, messages with a category besides ERROR or FATAL are
# suppressed, and no reports are done by default. Error mode is compatible with
# disabling specific errors.
#errors-only=
# Always return a 0 (non-error) status code, even if lint errors are found.
# This is primarily useful in continuous integration scripts.
#exit-zero=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=
# Return non-zero exit code if any of these messages/categories are detected,
# even if score is above --fail-under value. Syntax same as enable. Messages
# specified are enabled, while categories only check already-enabled messages.
fail-on=
# Specify a score threshold under which the program will exit with error.
fail-under=10
# Interpret the stdin as a python script, whose filename needs to be passed as
# the module_or_package argument.
#from-stdin=
# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS
# Add files or directories matching the regular expressions patterns to the
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths=
# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
# Emacs file locks
ignore-patterns=^\.#
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use, and will cap the count on Windows to
# avoid hangs.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.10
# Discover python modules and packages in the file system subtree.
recursive=no
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# In verbose mode, extra non-checker-related info will be displayed.
#verbose=
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style. If left empty, argument names will be checked with the set
# naming style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style. If left empty, attribute names will be checked with the set naming
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style. If left empty, class attribute names will be checked
# with the set naming style.
#class-attribute-rgx=
# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE
# Regular expression matching correct class constant names. Overrides class-
# const-naming-style. If left empty, class constant names will be checked with
# the set naming style.
#class-const-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style. If left empty, class names will be checked with the set naming style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style. If left empty, constant names will be checked with the set naming
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style. If left empty, function names will be checked with the set
# naming style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style. If left empty, inline iteration names will be checked
# with the set naming style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style. If left empty, method names will be checked with the set naming style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style. If left empty, module names will be checked with the set naming style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Regular expression matching correct type variable names. If left empty, type
# variable names will be checked with the set naming style.
#typevar-rgx=
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style. If left empty, variable names will be checked with the set
# naming style.
#variable-rgx=
[CLASSES]
# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# List of regular expressions of class ancestor names to ignore when counting
# public methods (see R0903)
exclude-too-few-public-methods=
# List of qualified class names to ignore when counting class parents (see
# R0901)
ignored-parents=
# Maximum number of arguments for function / method.
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[EXCEPTIONS]
# Exceptions that will emit a warning when caught.
overgeneral-exceptions=BaseException,
Exception
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=
# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=
# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=
# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
# UNDEFINED.
confidence=HIGH,
CONTROL_FLOW,
INFERENCE,
INFERENCE_FAILURE,
UNDEFINED
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
line-too-long,
missing-module-docstring,
redefined-outer-name
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[METHOD_ARGS]
# List of qualified names (i.e., library.method) which require a timeout
# parameter e.g. 'requests.api.get,requests.api.post'
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
notes-rgx=
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
# 'convention', and 'info' which contain the number of messages in each
# category, as well as 'statement' which is the total number of statements
# analyzed. This score is used by the global evaluation report (RP0004).
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
#output-format=
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[SIMILARITIES]
# Comments are removed from the similarity computation
ignore-comments=yes
# Docstrings are removed from the similarity computation
ignore-docstrings=yes
# Imports are removed from the similarity computation
ignore-imports=yes
# Signatures are removed from the similarity computation
ignore-signatures=yes
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: en_AG (hunspell), en_AU
# (hunspell), en_BS (hunspell), en_BW (hunspell), en_BZ (hunspell), en_CA
# (hunspell), en_DK (hunspell), en_GB (hunspell), en_GH (hunspell), en_HK
# (hunspell), en_IE (hunspell), en_IN (hunspell), en_JM (hunspell), en_MW
# (hunspell), en_NA (hunspell), en_NG (hunspell), en_NZ (hunspell), en_PH
# (hunspell), en_SG (hunspell), en_TT (hunspell), en_US (hunspell), en_ZA
# (hunspell), en_ZM (hunspell), en_ZW (hunspell).
spelling-dict=
# List of comma separated words that should be considered directives if they
# appear at the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of symbolic message names to ignore for Mixin members.
ignored-checks-for-mixins=no-member,
not-async-context-manager,
not-context-manager,
attribute-defined-outside-init
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of names allowed to shadow builtins
allowed-redefined-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io

1393
CHANGELOG.md Normal file

File diff suppressed because it is too large Load Diff

132
CODE_OF_CONDUCT.md Normal file
View File

@@ -0,0 +1,132 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
[INSERT CONTACT METHOD].
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 OpenPipelines
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

13
README.md Normal file
View File

@@ -0,0 +1,13 @@
OpenPipeline
================
<!-- README.md is generated by running 'quarto render README.qmd' -->
Extensible single cell analysis pipelines for reproducible and
large-scale single cell processing using Viash and Nextflow.
The provided pipelines are built using the [Viash
framework](http://www.viash.io) on top of the nextflow workflow system.
For more information on Nextflow please visit the [Nextflow github
page](https://github.com/nextflow-io/nextflow) and the [Nextflow read
the docs page](https://www.nextflow.io/docs/latest/index.html).

18
README.qmd Normal file
View File

@@ -0,0 +1,18 @@
---
title: OpenPipeline
format: gfm
---
<!-- README.md is generated by running 'quarto render README.qmd' -->
```{r, echo = FALSE, message = FALSE, error = FALSE, warning = FALSE}
library(tidyverse)
```
Extensible single cell analysis pipelines for reproducible and large-scale single cell processing using Viash and Nextflow.
The provided pipelines are built using the [Viash framework](http://www.viash.io) on top of the
nextflow workflow system. For more information on Nextflow please visit the [Nextflow github page](https://github.com/nextflow-io/nextflow)
and the [Nextflow read the docs page](https://www.nextflow.io/docs/latest/index.html).

29
_viash.yaml Normal file
View File

@@ -0,0 +1,29 @@
viash_version: 0.9.0
version: dev
source: src
target: target
# Note: this causes the docker images to be renamed
name: openpipeline
organization: openpipelines-bio
links:
  repository: https://github.com/openpipelines-bio/openpipeline
  docker_registry: ghcr.io
  homepage: https://openpipelines.bio
  documentation: https://openpipelines.bio/fundamentals
  issue_tracker: https://github.com/openpipelines-bio/openpipeline/issues
info:
  test_resources:
    - type: s3
      path: s3://openpipelines-data
      dest: resources_test
config_mods: |
  .test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}
  .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
  .runners[.type == 'nextflow'].directives.tag := '$id'
  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'

5340
images/concepts/fig.svg Normal file

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 389 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 8.9 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 9.8 KiB

View File

@@ -0,0 +1,9 @@
#!/bin/bash

# Regenerate the per-concept SVG figures from the master drawing
# images/concepts/fig.svg. Each object id listed below is exported with its
# own inkscape call, and the exported file is then passed through svgo.

# Remove previously exported figures. -f keeps a fresh checkout (where no
# fig_*.svg exists yet) from printing a "No such file or directory" error.
rm -f images/concepts/fig_*.svg

# Object ids inside fig.svg that get their own figure.
ids=(
  cell
  modality_rna
  modality_adt
  modality_vdj
  modality_atac
  workflow_multiomics_rna_singlesample
  workflow_multiomics_rna_multisample
  workflow_multiomics_adt_singlesample
  workflow_multiomics_adt_multisample
)

for id in "${ids[@]}"; do
  # NOTE(review): no output name is given, so this relies on inkscape writing
  # images/concepts/fig_<id>.svg — the svgo call below expects that path.
  inkscape --export-type="svg" --export-id="$id" --export-id-only images/concepts/fig.svg
  svgo "images/concepts/fig_${id}.svg"
done

5
main.nf Normal file
View File

@@ -0,0 +1,5 @@
// Enable the DSL2 syntax for this script.
nextflow.enable.dsl=2
// Placeholder entry workflow: it only prints a notice and starts no process.
workflow {
print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
}

22
nextflow.config Normal file
View File

@@ -0,0 +1,22 @@
// template nextflow.config for nested workflows
manifest {
// Minimum Nextflow version; per the Nextflow manifest documentation the leading
// '!' stops the execution when the running version does not match.
nextflowVersion = '!>=20.12.1-edge'
}
// TODO 1: uncomment and adapt `rootDir` according to the relative path within the project
// params {
// rootDir = "$projectDir/../.."
// }
//
// workflowDir = "${params.rootDir}/workflows"
// targetDir = "${params.rootDir}/target/nextflow"
// TODO 2: insert custom imports here
// TODO 3: uncomment
// docker {
// runOptions = "-v \$(realpath ${params.rootDir}):\$(realpath ${params.rootDir})"
// }

View File

@@ -0,0 +1,201 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_fixed
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
# Remove the temporary directory created above, if it still exists.
function clean_up {
  if [[ -d "$TMPDIR" ]]; then
    rm -r "$TMPDIR"
  fi
}
# Register the clean-up; without the trap the function is never called and the
# temporary directory is left behind after every run.
trap clean_up EXIT
# dataset page:
# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# Write the first 200000 records of a FASTQ file to a gzip-compressed subset.
# Arguments:
#   $1: input FASTQ file (handed to `seqkit head`)
#   $2: gzip-compressed output file
# Does nothing when the output file already exists; returns 1 on failure.
function seqkit_head {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$input")"
    # Write under a temporary name first: a failed run must not leave a
    # truncated "$output" that the existence check above would accept later.
    if seqkit head -n 200000 "$input" | gzip > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}
orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz"
# download feature reference
feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
if [[ ! -f "$feature_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
fi
# download probe set
probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
if [[ ! -f "$probe_set" ]]; then
wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set"
fi
# Rewrite the "#reference_genome=GRCh38" header of the probe set to
# "#reference_genome=output" (in place).
sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
# Create a copy of the probe set in which the gene ID in the first column is
# replaced by the ID found for it in the reference GTF. Probes without a hit in
# the GTF are dropped.
probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
if [[ ! -f "$probe_set_corrected" ]]; then
  reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
  gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf"
  # Build the result under a temporary name and rename it at the very end, so
  # that an interrupted run cannot leave a partial file that passes the
  # existence check above on the next run.
  probe_set_partial="$probe_set_corrected.partial"
  : > "$probe_set_partial"
  # IFS= and -r keep each line verbatim (no backslash processing or trimming);
  # `|| [[ -n $line ]]` also processes a final line without trailing newline.
  while IFS= read -r line || [[ -n $line ]]; do
    echo "Line: $line"
    old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
    echo "Old ID: $old_id"
    if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
      # column header and '#' metadata lines are copied unchanged
      echo "Just writing line"
      printf "%s\n" "$line" >> "$probe_set_partial"
    else
      # grep exits with 1 when nothing matches; that is not an error here
      gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
      if [ ! -z "$gtf_lookup" ]; then
        echo "Found hit"
        # take field 10 of the "gene" records (presumably the quoted gene_id
        # value - confirm against the GTF) and strip the leading '"' and trailing '";'
        new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
        echo "New ID: $new_id"
        new_line=${line/"$old_id"/"$new_id"}
        printf "%s\n" "$new_line" >> "$probe_set_partial"
      else
        echo "Did not find hit"
      fi
    fi
  done < "$probe_set"
  mv "$probe_set_partial" "$probe_set_corrected"
fi
# # Input FASTA:
# # >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
# # Output FASTA:
# # >chr1 1
# input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
# fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
# if [[ ! -f "$input_fastq" ]]; then
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
# fi
# zcat "$input_fastq" \
# | sed -E 's/^>(\S+).*/>\1 \1/' \
# | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
# | sed -E 's/^>MT />chrM /' \
# > "$fasta_modified"
# pigz --fast "$fasta_modified"
# fasta_modified="$fasta_modified.gz"
# # Input GTF:
# # ... gene_id "ENSG00000223972.5"; ...
# # Output GTF:
# # ... gene_id "ENSG00000223972"; gene_version "5"; ...
# input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
# gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
# if [[ ! -f "$input_gtf" ]]; then
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
# fi
# REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
# zcat "$input_gtf" \
# | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
# | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
# | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
# > "$gtf_modified"
# pigz --fast "$gtf_modified"
# gtf_modified="$gtf_modified.gz"
# Build the Cell Ranger reference with the make_reference workflow unless it is
# already present in the cache directory.
final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
if [ ! -f "$final_genome" ]; then
  # $fasta_modified and $gtf_modified are only assigned in the commented-out
  # section above. Stop with a clear message instead of handing empty
  # arguments to nextflow.
  : "${fasta_modified:?fasta_modified is not set (the code that prepares it is commented out above)}"
  : "${gtf_modified:?gtf_modified is not set (the code that prepares it is commented out above)}"
  NXF_VER=21.10.6 nextflow \
    run . \
    -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
    -profile docker \
    -resume \
    --id "GRCh38" \
    --genome_fasta "$fasta_modified" \
    --transcriptome_gtf "$gtf_modified" \
    --target "cellranger" \
    --output_fasta "reference.fa.gz" \
    --output_gtf "reference.gtf.gz" \
    --output_cellranger "GRCh38.cellranger.genome.fa.gz" \
    --publish_dir "$HOME/.cache/openpipeline/"
fi
# Run mapping pipeline
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- ${orig_sample_id}_subset
library_type:
- "Gene Expression"
library_lanes:
- "any"
probe_set: "$probe_set_corrected"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
publish_dir: "$OUT/processed"
probe_barcode_ids:
- BC001
- BC002
- BC003
- BC004
sample_ids:
- Liver_BC1
- Ovarian_BC2
- Colorectal_BC3
- Pancreas_BC4
gex_generate_bam: false
sample_force_cells:
- 5000
- -1
- -1
- -1
HERE
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config

View File

@@ -0,0 +1,149 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_anticmv
OUT=resources_test/$ID
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/resources/datasets/integrated-gex-totalseqc-and-tcr-analysis-of-connect-generated-library-from-5k-cmv-t-cells-2-standard
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/5k_human_antiCMV_T_TBNK_connect_Multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# Write the first 200000 records of a FASTQ file to a gzip-compressed subset.
# Arguments:
#   $1: input FASTQ file (handed to `seqkit head`)
#   $2: gzip-compressed output file
# Does nothing when the output file already exists; returns 1 on failure.
function seqkit_head {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$input")"
    # Write under a temporary name first: a failed run must not leave a
    # truncated "$output" that the existence check above would accept later.
    if seqkit head -n 200000 "$input" | gzip > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}
orig_sample_id="5k_human_antiCMV_T_TBNK_connect"
seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R1_001.fastq.gz"
seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R2_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R2_001.fastq.gz"
# download immune panel fasta if needed
feature_reference="$raw_dir/feature_reference.csv"
if [[ ! -f "$feature_reference" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_count_feature_reference.csv" -O "$feature_reference"
fi
# download vdj reference if needed
vdj_ref="$raw_dir/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"
if [[ ! -f "$vdj_ref" ]]; then
wget "https://cf.10xgenomics.com/supp/cell-vdj/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" -O "$vdj_ref"
fi
# Run mapping pipeline
# (the conversion to h5mu is done as a separate step further down)
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_GEX_1_subset"
- "${orig_sample_id}_AB_subset"
- "${orig_sample_id}_VDJ_subset"
library_type:
- "Gene Expression"
- "Antibody Capture"
- "VDJ"
gex_reference: "$genome_tar"
vdj_reference: "$vdj_ref"
feature_reference: "$feature_reference"
publish_dir: "$OUT/processed"
HERE
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config
# Create h5mu
# Writes the parameters for the from_cellranger_multi_to_h5mu run below to
# /tmp/params.yaml.
# NOTE(review): the input path ends in ".output.output", whereas the sibling
# dataset scripts (10x_5k_beam, 10x_5k_lung_crispr) use
# "<id>.cellranger_multi.output" - confirm which name the mapping step publishes.
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output.output"
publish_dir: "$OUT/"
output: "$orig_sample_id.h5mu"
HERE
nextflow \
run . \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
# Overwrite /tmp/params.yaml with the parameters of the full pipeline run: the
# h5mu file requested above is the input, "<sample id>_mms.h5mu" the output.
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/$orig_sample_id.h5mu"
publish_dir: "$OUT/"
output: "${orig_sample_id}_mms.h5mu"
HERE
# Run full pipeline
# Unlike the runs above, the main script is taken from src/ and not from target/nextflow/.
nextflow \
run . \
-main-script src/workflows/multiomics/full_pipeline/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
# create fastqc directory
fastqc_dir="$OUT/fastqc"
mkdir -p "$fastqc_dir"
./target/docker/qc/fastqc/fastqc \
--input "$raw_dir" \
--mode "dir" \
--output "$fastqc_dir"

View File

@@ -0,0 +1,126 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_beam
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/datasets/5k-human-a0201-b0702-pbmcs-beam-t-2-standard
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/5k_human_A0201_B0702_PBMCs_BEAM_T"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# Write the first 200000 records of a FASTQ file to a gzip-compressed subset.
# Arguments:
#   $1: input FASTQ file (handed to `seqkit head`)
#   $2: gzip-compressed output file
# Does nothing when the output file already exists; returns 1 on failure.
function seqkit_head {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$input")"
    # Write under a temporary name first: a failed run must not leave a
    # truncated "$output" that the existence check above would accept later.
    if seqkit head -n 200000 "$input" | gzip > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}
orig_sample_id="beamt_human_A0201_B0702_pbmc"
seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R2_001.fastq.gz"
# download feature reference
feature_ref="$raw_dir/beamt_human_A0201_B0702_pbmc_feature_reference.csv"
if [[ ! -f "$feature_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_count_feature_reference.csv" -O "$feature_ref"
fi
# download vdj reference if needed
vdj_ref="$raw_dir/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz"
if [[ ! -f "$vdj_ref" ]]; then
wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -O "$vdj_ref"
fi
# Run mapping pipeline
# (the conversion to h5mu is done as a separate step further down)
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_gex_subset"
- "${orig_sample_id}_vdj_subset"
- "${orig_sample_id}_ag_subset"
library_type:
- "Gene Expression"
- "VDJ-T"
- "Antigen Capture"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
vdj_reference: "$vdj_ref"
control_id:
- negative_control_A0201
- negative_control_B0702
mhc_allele:
- "HLA-A*02:01"
- "HLA-B*07:02"
publish_dir: "$OUT/processed"
HERE
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config
# Create h5mu
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/processed/$ID.cellranger_multi.output"
publish_dir: "$OUT/"
output: "$orig_sample_id.h5mu"
HERE
nextflow \
run . \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels_ci.config

View File

@@ -0,0 +1,134 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=10x_5k_lung_crispr
OUT="resources_test/$ID"
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# dataset page:
# https://www.10xgenomics.com/resources/datasets/5-k-a-549-lung-carcinoma-cells-no-treatment-transduced-with-a-crispr-pool-3-1-standard-6-0-0
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
# download fastqs and untar
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_fastqs.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# Write the first 200000 records of a FASTQ file to a gzip-compressed subset.
# Arguments:
#   $1: input FASTQ file (handed to `seqkit head`)
#   $2: gzip-compressed output file
# Does nothing when the output file already exists; returns 1 on failure.
function seqkit_head {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$input")"
    # Write under a temporary name first: a failed run must not leave a
    # truncated "$output" that the existence check above would accept later.
    if seqkit head -n 200000 "$input" | gzip > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}
orig_sample_id="SC3_v3_NextGem_DI_CRISPR_A549_5K"
seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R2_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R1_001.fastq.gz"
seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R2_001.fastq.gz"
# download crispr feature reference
crispr_ref="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv"
# Only download when the file is not there yet. The test has to use
# $crispr_ref: with the misspelled, unset $crisp_ref the condition was always
# true and the file was fetched again on every run.
if [[ ! -f "$crispr_ref" ]]; then
  wget "https://cf.10xgenomics.com/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" -O "$crispr_ref"
fi
# Create a copy of the feature reference in which the target gene ID (column 7)
# is replaced by the ID found for it in the reference GTF. Rows without a hit in
# the GTF are dropped.
crispr_ref_adjusted="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv"
reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
# The file is regenerated on every run. It is built under a temporary name that
# is emptied first and renamed at the end: appending straight to
# "$crispr_ref_adjusted" duplicated every row each time the script was re-run.
crispr_ref_partial="$crispr_ref_adjusted.partial"
: > "$crispr_ref_partial"
# IFS= and -r keep each line verbatim (no backslash processing or trimming);
# `|| [[ -n $line ]]` also processes a final line without trailing newline.
while IFS= read -r line || [[ -n $line ]]; do
  echo "Line: $line"
  old_id=$( printf "%s\n" "$line" | awk -F',' '{print $7}' )
  echo "Old ID: $old_id"
  if [ "$old_id" = "Non-Targeting" ] || [ "$old_id" = "target_gene_id" ] ; then
    # header row and non-targeting guides are copied unchanged
    echo "Just writing line"
    printf "%s\n" "$line" >> "$crispr_ref_partial"
  else
    # zgrep exits with 1 when nothing matches; that is not an error here
    gtf_lookup=$(zgrep "$old_id" "$reference_gtf" || test $? = 1;)
    if [ ! -z "$gtf_lookup" ]; then
      echo "Found hit"
      # take field 10 of the "gene" records (presumably the quoted gene_id
      # value - confirm against the GTF) and strip the leading '"' and trailing '";'
      new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
      echo "New ID: $new_id"
      new_line=${line/"$old_id"/"$new_id"}
      printf "%s\n" "$new_line" >> "$crispr_ref_partial"
    else
      echo "Did not find hit"
    fi
  fi
done < "$crispr_ref"
mv "$crispr_ref_partial" "$crispr_ref_adjusted"
# Run mapping pipeline
# (the conversion to h5mu is done as a separate step further down)
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- "${orig_sample_id}_gex_subset"
- "${orig_sample_id}_crispr_subset"
library_type:
- "Gene Expression"
- "CRISPR Guide Capture"
gex_reference: "$genome_tar"
feature_reference: "$crispr_ref_adjusted"
publish_dir: "$OUT/processed"
HERE
nextflow \
run . \
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config
# Create h5mu
cat > /tmp/params.yaml << HERE
id: "$ID"
input: "$OUT/processed/10x_5k_lung_crispr.cellranger_multi.output"
publish_dir: "$OUT/"
output: "$orig_sample_id.h5mu"
HERE
nextflow \
run . \
-main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \
-resume \
-profile docker,mount_temp \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config

View File

@@ -0,0 +1,81 @@
#!/bin/bash
set -eo pipefail
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
ID=annotation_test_data
OUT=resources_test/$ID/
# ideally, this would be a versioned pipeline run
[ -d "$OUT" ] || mkdir -p "$OUT"
# Download Tabula Sapiens Blood reference h5ad from https://doi.org/10.5281/zenodo.7587774
wget "https://zenodo.org/record/7587774/files/TS_Blood_filtered.h5ad?download=1" -O "${OUT}/tmp_TS_Blood_filtered.h5ad"
# Download Tabula Sapiens Blood pretrained model from https://doi.org/10.5281/zenodo.7580707
wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?download=1" -O "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
# Download PopV specific CL ontology files - needed for OnClass
# OUT_ONTOLOGY="${OUT}/ontology"
# [ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.obo \
# -O "${OUT_ONTOLOGY}/cl.obo"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology \
# -O "${OUT_ONTOLOGY}/cl.ontology"
# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology.nlp.emb \
# -O "${OUT_ONTOLOGY}/cl.ontology.nlp.emb"
# Process Tabula Sapiens Blood reference h5ad
# (Select one individual and 100 cells per cell type)
python <<HEREDOC
import anndata as ad
ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"]
n=100
s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
# assert sub_ref_adata_final.shape == (500, 58870)
sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
HEREDOC
echo "> Converting to h5mu"
viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5ad" \
--output "${OUT}/TS_Blood_filtered.h5mu" \
--modality "rna"
rm "${OUT}/tmp_TS_Blood_filtered.h5ad"
echo "> Downloading pretrained CellTypist model and sample test data"
wget https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl \
-O "${OUT}/celltypist_model_Immune_All_Low.pkl"
wget https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad \
-O "${OUT}/demo_2000_cells.h5ad"
viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \
--input "${OUT}/demo_2000_cells.h5ad" \
--output "${OUT}/demo_2000_cells.h5mu" \
--modality "rna"
# Download the OnClass ontology data into $OUT/ontology and the OnClass model
# files into $OUT/onclass_model.
echo "> Fetching OnClass data and models"
OUT_ONTOLOGY="${OUT}/ontology"
[ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY"
wget https://figshare.com/ndownloader/files/28394466 -O "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
# unpack without the two leading path components of the archive members
tar -xzvf "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" -C "${OUT_ONTOLOGY}" --strip-components=2
# drop one extracted file and the downloaded archive
rm "${OUT_ONTOLOGY}/allen.ontology"
rm "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz"
wget https://figshare.com/ndownloader/files/28394541 -O "${OUT}/OnClass_models.tar.gz"
tar -xzvf "${OUT}/OnClass_models.tar.gz" -C "${OUT}" --strip-components=1
rm "${OUT}/OnClass_models.tar.gz"
# NOTE(review): this archive is downloaded near the top of the script and removed
# here; no command in between reads it - confirm whether the download is still needed.
rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
# delete every regular file below Pretrained_model whose name does not start
# with "example_file_model", then rename the directory
find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} +
mv "${OUT}/Pretrained_model" "${OUT}/onclass_model"

View File

@@ -0,0 +1,8 @@
#!/bin/bash
set -eo pipefail
# Sync the local resources_test directory to the s3://openpipelines-data bucket,
# skipping paths matching "temp_*" or "tmp_*". Per the AWS CLI documentation,
# --delete removes destination files that are missing locally and --dryrun only
# displays the operations without performing them, so as written nothing is changed.
aws s3 sync --profile di "resources_test" "s3://openpipelines-data" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun
# Same sync, restricted to a single dataset directory.
id=cellranger_tiny_fastq
aws s3 sync --profile di "resources_test/$id" "s3://openpipelines-data/$id" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun

View File

@@ -0,0 +1,144 @@
#!/bin/bash
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=bdrhap_5kjrt
OUT=resources_test/$ID
n_threads=30
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# check whether reference is available
reference_dir="resources_test/reference_gencodev41_chr1"
genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
if [[ ! -f "$genome_tar" ]]; then
echo "$genome_tar does not exist. Please create the reference genome first"
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/12WTA-ABC-SMK-EB-5kJRT"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
genome_dir="$raw_dir/temp_reference_gencodev41_chr1"
if [[ ! -d "$genome_dir" ]]; then
echo "> Untarring genome"
mkdir -p "$genome_dir"
tar -xvf "$genome_tar" -C "$genome_dir"
fi
# process WTA fastq files
# map to chr1, subsample chr1 reads
mapping_dir="$raw_dir/temp_mapping_chr_1"
wta_r1_chr1="$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq"
wta_r2_chr1="$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq"
# Redo the step unless BOTH output files exist; checking R1 alone let a run
# that failed while writing R2 be skipped on the next invocation.
if [[ ! -f "$wta_r1_chr1" || ! -f "$wta_r2_chr1" ]]; then
  echo "> Processing 12WTA_S1_L432_R[12]_001.fastq.gz"
  mkdir -p "$mapping_dir"

  # MUST USE A STAR THAT IS COMPATIBLE WITH BD RHAPSODY
  # For the cwl pipeline 1.9.1, 2.5.2b should work.
  echo "star"
  # "$(pwd)" is quoted so that a working directory containing spaces does not
  # split the docker arguments.
  docker run --rm -i \
    -v "$(pwd)/$OUT:$(pwd)/$OUT" \
    -v "$tar_dir:$tar_dir" \
    -w "$(pwd)" bdgenomics/rhapsody:1.10.1 \
    STAR \
      --runThreadN "$n_threads" \
      --genomeDir "$genome_dir" \
      --readFilesIn "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" \
      --runRNGseed 100 \
      --outFileNamePrefix "$mapping_dir/" \
      --readFilesCommand "gzip -d -k -c" \
      --clip3pAdapterSeq "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \
      --outFilterMatchNmin "25" \
      --quantTranscriptomeBan "Singleend" # Prohibit mapping of one side of the read

  # chown to current user before removing mapping dir
  docker run --rm -i -v "$(pwd)/$OUT:$(pwd)/$OUT" -w "$(pwd)" bdgenomics/rhapsody:1.10.1 \
    chown "$(id -u):$(id -g)" --silent --recursive "$mapping_dir/"

  # keep the alignments that have none of the SAM flag bits in 260 set
  echo "samtools"
  samtools view -F 260 "$mapping_dir/Aligned.out.sam" > "$mapping_dir/primary_aligned_reads.sam"

  # unique read names of those alignments, limited to the first 500000
  echo "cut"
  cut -f 1 "$mapping_dir/primary_aligned_reads.sam" | sort | uniq > "$mapping_dir/mapped_reads.txt"
  head -500000 "$mapping_dir/mapped_reads.txt" > "$mapping_dir/mapped_reads_subset.txt"

  # Extract the selected reads from both fastq files. The results are written
  # under temporary names and renamed only after both extractions succeeded
  # (the script runs with `set -e`), so a failed run leaves no output file
  # that would satisfy the check at the top of this block.
  echo "seqkit"
  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R1_001.fastq.gz" > "$wta_r1_chr1.tmp"
  seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" > "$wta_r2_chr1.tmp"
  mv "$wta_r1_chr1.tmp" "$wta_r1_chr1"
  mv "$wta_r2_chr1.tmp" "$wta_r2_chr1"

  # rm -r "$mapping_dir"
  # rm -r "$genome_dir"
fi
# subsample other files

# Write the first 500000 records of FASTQ file $1 to the gzip-compressed file $2.
# Does nothing when $2 already exists; returns 1 on failure.
function subset_fastq_head {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$output")"
    # temporary name first, so that a failed run leaves no truncated "$output"
    if seqkit head -n 500000 "$input" | gzip > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}

# Compress FASTQ file $1 into $2 with gzip -9, keeping $1.
# Does nothing when $2 already exists; returns 1 on failure.
function compress_fastq {
  local input="$1"
  local output="$2"
  if [[ ! -f "$output" ]]; then
    echo "> Processing $(basename "$output")"
    if gzip -9 -k -c "$input" > "$output.tmp"; then
      mv "$output.tmp" "$output"
    else
      rm -f "$output.tmp"
      return 1
    fi
  fi
}

smk_r1_file="$raw_dir/12SMK_S1_L432_R1_001_subset.fastq.gz"
subset_fastq_head "$tar_dir/12SMK_S1_L432_R1_001.fastq.gz" "$smk_r1_file"

smk_r2_file="$raw_dir/12SMK_S1_L432_R2_001_subset.fastq.gz"
subset_fastq_head "$tar_dir/12SMK_S1_L432_R2_001.fastq.gz" "$smk_r2_file"

abc_r1_file="$raw_dir/12ABC_S1_L432_R1_001_subset.fastq.gz"
subset_fastq_head "$tar_dir/12ABC_S1_L432_R1_001.fastq.gz" "$abc_r1_file"

abc_r2_file="$raw_dir/12ABC_S1_L432_R2_001_subset.fastq.gz"
subset_fastq_head "$tar_dir/12ABC_S1_L432_R2_001.fastq.gz" "$abc_r2_file"

# the WTA reads were already subsampled during the chr1 mapping step; only compress them
wta_r1_file="$raw_dir/12WTA_S1_L432_R1_001_subset.fastq.gz"
compress_fastq "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" "$wta_r1_file"

wta_r2_file="$raw_dir/12WTA_S1_L432_R2_001_subset.fastq.gz"
compress_fastq "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" "$wta_r2_file"
# copy immune panel fasta
fasta_file="$raw_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta"
if [[ ! -f "$fasta_file" ]]; then
cp "$tar_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" "$fasta_file"
fi
genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz"
nextflow run . \
-main-script target/nextflow/workflows/ingestion/bd_rhapsody/main.nf \
-resume \
-profile docker,mount_temp \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/errorstrat_ignore.config \
--reads "$wta_r1_file;$wta_r2_file;$abc_r1_file;$abc_r2_file;$smk_r1_file;$smk_r2_file" \
--reference_archive "$genome_tar" \
--abseq_reference "$fasta_file" \
--sample_tags_version "hs" \
--tag_names "1-Jurkat;2-Ramos;3-THP1" \
--output_raw "output_raw" \
--output "output.h5mu" \
--output_state state.yaml \
--cell_calling_data "mRNA" \
--exact_cell_count 4000 \
--generate_bam true \
--publish_dir "$OUT/processed"

View File

@@ -0,0 +1,72 @@
#!/bin/bash
set -eo pipefail
# TODO: we should turn this into viash components
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
ID=bdrhap_vdj
OUT=resources_test/$ID
n_threads=30
# create raw directory
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"
# Check whether seqkit is available
if ! command -v seqkit &> /dev/null; then
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
exit 1
fi
# download and untar source fastq files
tar_dir="$HOME/.cache/openpipeline/VDJDemo"
if [[ ! -d "$tar_dir" ]]; then
mkdir -p "$tar_dir"
wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/VDJDemo/VDJDemo.tar" -O "$tar_dir.tar"
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
rm "$tar_dir.tar"
fi
# subset fastq files
# For every sample, write the first 300000 records of the downloaded fastq to
# "$raw_dir/<sample>_subset.fastq.gz", unless that file already exists.
for sample_id in RhapVDJDemo-BCR_S1_L001_R1_001 RhapVDJDemo-BCR_S1_L001_R2_001 RhapVDJDemo-mRNA_S5_L001_R1_001 RhapVDJDemo-mRNA_S5_L001_R2_001 RhapVDJDemo-TCR_S3_L001_R1_001 RhapVDJDemo-TCR_S3_L001_R2_001; do
  subset_file="$raw_dir/${sample_id}_subset.fastq.gz"
  if [[ ! -f "$subset_file" ]]; then
    echo "> Processing $sample_id"
    # Write under a temporary name first: a failed run must not leave a
    # truncated "$subset_file" that the existence check would accept later.
    if seqkit head -n 300000 "$tar_dir/$sample_id.fastq.gz" | gzip > "$subset_file.tmp"; then
      mv "$subset_file.tmp" "$subset_file"
    else
      rm -f "$subset_file.tmp"
      exit 1
    fi
  fi
  unset subset_file
done
# copy immune panel fasta
fasta_file="$raw_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta"
if [[ ! -f "$fasta_file" ]]; then
cp "$tar_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" "$fasta_file"
fi
# create params file
cat > /tmp/params.yaml << HERE
param_list:
- id: "targeted_vdj"
input: "$raw_dir/RhapVDJDemo-*_S*_L001_R[12]_001_subset.fastq.gz"
mode: targeted
reference: "$fasta_file"
publish_dir: "$OUT/processed"
putative_cell_call: "mRNA"
vdj_version: human
HERE
# run bd rhapsody pipeline
nextflow \
run . \
-main-script src/workflows/ingestion/bd_rhapsody/main.nf \
-resume \
-profile docker,mount_temp \
-with-trace work/trace.txt \
-params-file /tmp/params.yaml \
-c src/workflows/utils/labels.config \
-c src/workflows/utils/errorstrat_ignore.config

View File

@@ -0,0 +1,74 @@
#!/bin/bash
#
# Builds the `cellranger_atac_tiny_bcl` test resources under resources_test/:
#   1. downloads the tiny ATAC BCL run, its layout file and its sample sheet,
#   2. demultiplexes the run with cellranger_atac_mkfastq,
#   3. creates count matrices with cellranger_atac_count.
# Each stage is skipped when its output is already present.

set -eo pipefail

# work from the repository root so that all relative paths below resolve
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=cellranger_atac_tiny_bcl
OUT="resources_test/$ID/"
DIR="$OUT"
REFERENCE_DIR=resources_test/reference_gencodev41_chr1

# scratch directory; the EXIT trap below removes it again
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
clean_up() {
  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT

# build the executables that are called below
viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb

bcl_dir="${OUT}/bcl"
fastq_dir="${OUT}/fastqs"
counts_dir="${OUT}/counts"
download_file=target/docker/download/download_file/download_file

# --- stage 1: raw BCL data -------------------------------------------------
# the sample sheet is fetched last, so its presence marks a finished download
if [[ ! -f "$bcl_dir/sample_sheet.csv" ]]; then
  mkdir -p "$bcl_dir"

  # fetch the tarball, unpack it in place and drop the archive again
  bcl_tar="$bcl_dir/cellranger-atac-tiny-bcl-1.0.0.tar.gz"
  "$download_file" \
    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-1.0.0.tar.gz \
    --output "$bcl_tar"
  tar -xf "$bcl_tar" \
    --strip-components=1 \
    -C "$bcl_dir"
  rm "$bcl_tar"

  # layout file: describes the samples (1 in this case) and lanes
  "$download_file" \
    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-simple-1.0.0.csv \
    --output "$bcl_dir/layout.csv"

  # sample sheet
  "$download_file" \
    --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-samplesheet-1.0.0.csv \
    --output "$bcl_dir/sample_sheet.csv"
fi

# --- stage 2: demultiplex to fastq ------------------------------------------
if [[ ! -d "$fastq_dir" ]]; then
  mkdir -p "$fastq_dir"
  target/docker/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq \
    --input "$bcl_dir" \
    --csv "$bcl_dir/layout.csv" \
    --output "$fastq_dir"
fi

# --- stage 3: count matrices ------------------------------------------------
if [[ ! -d "$counts_dir" ]]; then
  mkdir -p "$counts_dir"
  target/docker/mapping/cellranger_atac_count/cellranger_atac_count \
    --input "$fastq_dir/HJN3KBCX2/test_sample/" \
    --reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \
    --output "$counts_dir"
fi

View File

@@ -0,0 +1,112 @@
#!/bin/bash
#
# Builds the `cellranger_tiny_bcl` test resources under resources_test/.
# This first part downloads the tiny BCL run plus its sample sheet and
# demultiplexes it with cellranger_mkfastq. It calls the executables under
# target/docker/ directly and does not build them itself.

set -eo pipefail

# work from the repository root so that all relative paths below resolve
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=cellranger_tiny_bcl
OUT="resources_test/$ID/"
DIR="$OUT"

# scratch directory; the EXIT trap below removes it again
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
clean_up() {
  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT

bcl_dir="${OUT}/bcl"
fastq_dir="${OUT}/fastqs"
download_file=target/docker/download/download_file/download_file

# raw BCL data; the sample sheet is fetched last, so it marks a finished download
if [[ ! -f "$bcl_dir/sample_sheet.csv" ]]; then
  mkdir -p "$bcl_dir"

  # fetch the tarball, unpack it in place and drop the archive again
  bcl_tar="$bcl_dir/cellranger-tiny-bcl-1.2.0.tar.gz"
  "$download_file" \
    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz \
    --output "$bcl_tar"
  tar -xf "$bcl_tar" \
    --strip-components=1 \
    -C "$bcl_dir"
  rm "$bcl_tar"

  # sample sheet
  "$download_file" \
    --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv \
    --output "$bcl_dir/sample_sheet.csv"
fi

# demultiplex to fastq
if [[ ! -d "$fastq_dir" ]]; then
  mkdir -p "$fastq_dir"
  target/docker/demux/cellranger_mkfastq/cellranger_mkfastq \
    --input "$bcl_dir" \
    --sample_sheet "$bcl_dir/sample_sheet.csv" \
    --output "$fastq_dir"
fi
# Prepare a second copy of the run directory (bcl2/) that bcl-convert accepts.
#
# bcl-convert requires a v2 sample sheet and is stricter than mkfastq about
# filter files being present, so we copy the run and adapt the copy.
# We are using the tiny bcl dataset provided by Illumina:
# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
# Unfortunately,
#   1. the sample sheet delivered with it does not work with bcl-convert (v1 of the format)
#   2. 2 filter files are missing from the run directory that bcl-convert requires to run
#
# We worked around this by
#   1. Manually editing a sample sheet file suited for bcl-convert (format v2)
#   2. Adding a filter file
#
# The filter file is a binary file; we simply create an empty file and use that.
# bcl-convert might complain about it, but at least something is written out.
# An alternative is to use a filter file from a different project. This also generates
# a warning, but the fastq output files contain reads. The drawback is that those filter files
# are generally above 100MB in size.
#
# TODO: Check if a (binary) filter file can be generated that is small but works.
if [ ! -f "${OUT}/bcl2/sample_sheet.csv" ]; then
  # -p: the guard only checks for the sample sheet, so the directory may
  # already exist from an interrupted run; plain mkdir would abort the script
  mkdir -p "${OUT}/bcl2/"
  # quote the path and leave only the glob unquoted
  cp -r "${OUT}/bcl/"* "${OUT}/bcl2/"
  # replace the copied v1 sample sheet with a hand-written v2 sheet
  cat > "${OUT}/bcl2/sample_sheet.csv" << HERE
[Header],,,,,,,,,
FileFormatVersion,2,,,,,,
RunName,hiseq_test,,,,,,
InstrumentPlatform,NextSeq,,,,,,
IndexOrientation,Forward,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
Read1Cycles,26,,,,,,,,,
Read2Cycles,98,,,,,,
,,,,,,,,,
[Sequencing_Settings],,,,,,,
,,,,,,,
[BCLConvert_Settings],,,,,,,
SoftwareVersion,3.8.4,,,,,,
NoLaneSplitting,true,,,,,,
FastqCompressionFormat,gzip,,,,,,
,,,,,,,,,
[BCLConvert_Data],,,,,,,
Sample_ID,index,,,,,,
s1,GGTTTACT,,,,,,
,,,,,,,
[Cloud_Settings],,,,,,,
GeneratedVersion,1.3.0.202111171923,,,,,,
,,,,,,,
[Cloud_Data],,,,,,,
Sample_ID,ProjectName,LibraryName,LibraryPrepKitName,IndexAdapterKitName,I7_Index_ID,Sample_Name,Description,Instrument,Type
s1,p1,s1_SI-P03-C9,,,IDT01,SI-P03-C9,s1,NextSeq,HighOutput_75cycles
HERE
  # empty placeholder for the filter file (see the explanation above)
  touch "${OUT}/bcl2/Data/Intensities/BaseCalls/L001/s_1_1101.filter"
fi

View File

@@ -0,0 +1,118 @@
#!/bin/bash
#
# Builds the `cellranger_tiny_fastq` test resources under resources_test/.
# This first part unpacks the tiny fastq and reference data that ship inside
# the Cell Ranger 6.1.2 tarball, rebuilds the reference with
# star_build_reference and runs cellranger count.
# The tarball itself must be placed at $OUT/temp_cellranger-6.1.2.tar.gz by hand.

set -eo pipefail

# work from the repository root so that all relative paths below resolve
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=cellranger_tiny_fastq
OUT="resources_test/$ID/"
DIR="$OUT"

# the Cell Ranger tarball is not fetched by this script; bail out without it
cellranger_tar_gz="${OUT}/temp_cellranger-6.1.2.tar.gz"
if [[ ! -f "$cellranger_tar_gz" ]]; then
  echo "Download Cell Ranger 6.1.2 manually first!"
  exit 1
fi

# Extract cellranger-6.1.2/external/<member> from the tarball into <dest>,
# dropping the three leading path components.
untar_external() {
  local member="$1" dest="$2"
  mkdir -p "$dest"
  tar -xzf "$cellranger_tar_gz" \
    -C "$dest" \
    "cellranger-6.1.2/external/$member" \
    --strip-components=3
}

# untar fastqs
cellranger_tiny_fastq="${OUT}/cellranger_tiny_fastq"
if [[ ! -f "${cellranger_tiny_fastq}/tinygex_S1_L001_R1_001.fastq.gz" ]]; then
  untar_external cellranger_tiny_fastq "$cellranger_tiny_fastq"
fi

# untar ref
cellranger_tiny_ref="${OUT}/cellranger_tiny_ref"
if [[ ! -f "${cellranger_tiny_ref}/reference.json" ]]; then
  untar_external cellranger_tiny_ref "$cellranger_tiny_ref"
fi

# Create ref with more recent STAR version
recent_ref_dir="${OUT}/cellranger_tiny_ref_v2_7_10_a"
if [[ ! -f "${recent_ref_dir}/Genome" ]]; then
  mkdir -p "${recent_ref_dir}"
  target/docker/mapping/star_build_reference/star_build_reference \
    --genome_fasta "$cellranger_tiny_ref/fasta/genome.fa" \
    --output "$recent_ref_dir" \
    --genomeSAindexNbases 7 \
    --transcriptome_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz"
fi

# run cellranger count; the sorted bam marks a finished run
bam_dir="${OUT}/bam"
if [[ ! -f "$bam_dir/possorted_genome_bam.bam" ]]; then
  mkdir -p "$bam_dir"
  viash run src/mapping/cellranger_count/config.vsh.yaml -- \
    --input "$cellranger_tiny_fastq" \
    --reference "$cellranger_tiny_ref" \
    --output "$bam_dir"
fi
# convert the raw 10x count matrix to h5mu
raw_h5mu="${OUT}/raw_dataset.h5mu"
# Guard on the file this step actually writes. The previous check used the
# undefined variable $step1_h5mu, which expands to an empty string, so the
# test was always true and the conversion re-ran on every invocation.
if [ ! -f "$raw_h5mu" ]; then
  viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
    --input "${bam_dir}/raw_feature_bc_matrix.h5" \
    --output "$raw_h5mu"
fi
# inputs shared by the velocyto, htseq and multi_star steps below
velo_gtf="$cellranger_tiny_ref/genes/genes.gtf.gz"
velo_bam="$bam_dir/possorted_genome_bam.bam"

# run velocyto on the cellranger bam
velo_loom="${OUT}/velocyto.loom"
if [[ ! -f "$velo_loom" ]]; then
  viash run src/velocity/velocyto/config.vsh.yaml -- \
    --input "$velo_bam" \
    --output "$velo_loom" \
    --transcriptome "$velo_gtf"
fi

# combine raw counts with velocyto data
dataset_h5mu="${OUT}/dataset.h5mu"
if [[ ! -f "$dataset_h5mu" ]]; then
  viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
    --input_loom "$velo_loom" \
    --input_h5mu "$raw_h5mu" \
    --output "$dataset_h5mu"
fi

# run htseq
htseq_counts="${OUT}/htseq_counts.tsv"
if [[ ! -f "$htseq_counts" ]]; then
  viash run src/mapping/htseq_count/config.vsh.yaml -- \
    --input "$velo_bam" \
    --reference "$velo_gtf" \
    --output "$htseq_counts"
fi

# run multi_star on both lanes of the tinygex sample, against the rebuilt reference
# NOTE(review): `---cpus` (three dashes) is presumably a flag for the viash
# wrapper rather than a component argument; confirm before changing it
multi_star="${OUT}/multi_star"
if [[ ! -d "$multi_star" ]]; then
  viash run src/mapping/multi_star/config.vsh.yaml -- \
    --input_id "tinygex" \
    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz" \
    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz" \
    --input_id "tinygex" \
    --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz" \
    --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz" \
    --reference_index "$recent_ref_dir" \
    --reference_gtf "$velo_gtf" \
    --output "$multi_star" \
    ---cpus 30
fi

View File

@@ -0,0 +1,72 @@
#!/bin/bash
#
# Builds the `concat_test_data` test resources under resources_test/.

# Stop at the first failing command (also inside pipelines). Without this a
# failed download or conversion would fall through to the later steps, which
# delete the intermediate files and run the concat component on missing input.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

# The output folder
OUT="resources_test/concat_test_data/"

# create it if it doesn't exist already
[ -d "$OUT" ] || mkdir -p "$OUT"
# Base names (without extension) of the two 10x matrices that are processed.
mouse_matrix="e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix"
human_matrix="human_brain_3k_filtered_feature_bc_matrix"

echo "> Downloading files"
target/docker/download/download_file/download_file \
  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/e18_mouse_brain_fresh_5k/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5 \
  --output "$OUT/${mouse_matrix}.h5"
target/docker/download/download_file/download_file \
  --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/human_brain_3k/human_brain_3k_filtered_feature_bc_matrix.h5 \
  --output "$OUT/${human_matrix}.h5"

echo "> Converting to h5mu"
for matrix in "$mouse_matrix" "$human_matrix"; do
  viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \
    --input "$OUT/${matrix}.h5" \
    --output "$OUT/${matrix}.h5mu"
done

echo "> Subsetting datasets"
for matrix in "$human_matrix" "$mouse_matrix"; do
  viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
    --input "$OUT/${matrix}.h5mu" \
    --output "$OUT/${matrix}_subset.h5mu" \
    --number_of_observations 2000
done

echo "Making observation ids unique (required for concat component to function)"
viash run src/metadata/add_id/config.vsh.yaml -- \
  --input "$OUT/${human_matrix}_subset.h5mu" \
  --output "$OUT/${human_matrix}_subset_unique_obs.h5mu" \
  --input_id "human" \
  --make_observation_keys_unique
viash run src/metadata/add_id/config.vsh.yaml -- \
  --input "$OUT/${mouse_matrix}_subset.h5mu" \
  --output "$OUT/${mouse_matrix}_subset_unique_obs.h5mu" \
  --input_id "mouse" \
  --make_observation_keys_unique

# only the *_subset_unique_obs.h5mu files are kept; drop all intermediates
echo "Removing temp files"
rm "$OUT/${mouse_matrix}.h5mu" \
  "$OUT/${mouse_matrix}.h5" \
  "$OUT/${mouse_matrix}_subset.h5mu" \
  "$OUT/${human_matrix}_subset.h5mu" \
  "$OUT/${human_matrix}.h5mu" \
  "$OUT/${human_matrix}.h5"

echo "> Running concat component"
viash run src/dataflow/concat/config.vsh.yaml -- \
  --input "$OUT/${human_matrix}_subset_unique_obs.h5mu,$OUT/${mouse_matrix}_subset_unique_obs.h5mu" \
  --input_id "human,mouse" \
  --output "$OUT/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"

View File

@@ -0,0 +1,44 @@
#!/bin/bash
#
# Fetches the demuxafy test dataset and a few auxiliary files into
# resources_test/demuxafy_test_data, relative to the current working directory.

set -eo pipefail

# settings
ID=demuxafy_test_data
OUT=resources_test/$ID
DIR="$OUT"

# everything below happens inside the output directory
mkdir -p "$OUT"
cd "$OUT"

# download demuxafy test dataset
wget https://www.dropbox.com/s/m8u61jn4i1mcktp/TestData4PipelineSmall.tar.gz
tar -xf TestData4PipelineSmall.tar.gz

# keep the pooled bam (+ index), the vcf and the barcode list from the archive
archive_dir=TestData4PipelineSmall
cp "$archive_dir/test_dataset/outs/pooled.sorted.bam.bai" .
cp "$archive_dir/test_dataset/outs/pooled.sorted.bam" .
cp "$archive_dir/test_dataset.vcf" .
cp "$archive_dir/test_dataset/outs/filtered_gene_bc_matrices/Homo_sapiens_GRCh38p10/barcodes.tsv" .

# split the vcf per chromosome group, keeping the header lines in each part
# (-w makes the match end at a word boundary, so `^[1-2]` is meant to pick
# chromosomes 1 and 2 only, not e.g. 10-22)
grep -w '^#\|^#CHROM\|^[1-2]' test_dataset.vcf > test_dataset_chr1_2.vcf
grep -w '^#\|^#CHROM\|^[3-4]' test_dataset.vcf > test_dataset_chr3_4.vcf

# extra downloads, in order:
#   - subsetted bam and bai for souporcell
#   - variants from mixed sample
#   - dsc_pileup output
for url in \
  https://www.dropbox.com/s/7ew5lt0msf4z5gj/chr_1_pooled.sorted.bam \
  https://www.dropbox.com/s/tpplbj9sab9b2p4/chr_1_pooled.sorted.bam.bai \
  https://www.dropbox.com/s/btir7ge4kzc7tu1/mixed_variant.vcf \
  https://www.dropbox.com/s/17hj9i0yavtezx1/dsc_pileup.zip; do
  wget "$url"
done
unzip dsc_pileup.zip

# subsetted human genome reference
wget https://www.dropbox.com/s/ynlce3g7nwxthwg/genome_chr1.fa

# remove unnecessary files
rm -rf "$archive_dir"
rm TestData4PipelineSmall.tar.gz
rm dsc_pileup.zip

View File

@@ -0,0 +1,58 @@
#!/bin/bash
#
# Downloads the pre-trained HLCA scANVI reference model archive into
# resources_test/HLCA_reference_model/.

set -eo pipefail

# all paths below are relative to the repository root
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

ID=HLCA_reference_model
# common path prefix of the generated files, and the directory holding them
OUT=resources_test/$ID/$ID
DIR=$(dirname "$OUT")

# ideally, this would be a versioned pipeline run

# make sure the target directory is there (no-op when it already exists)
mkdir -p "$DIR"

# fetch the zipped model; the archive is stored as-is and is not extracted here
wget https://zenodo.org/record/6337966/files/HLCA_reference_model.zip \
  -O "${OUT}.zip"
# # Test query data
# # Source publication: Delorey, Toni M., et al. “COVID-19 tissue atlases reveal SARS-CoV-2 pathology and cellular targets.” Nature 595.7865 (2021): 107-113.
# wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5230nnn/GSM5230027/suppl/GSM5230027_04-P103142-S149-R01_raw_feature_bc_matrix.h5.gz \
# -O "${OUT}_query_test.h5.gz"
# gzip -d "${OUT}_query_test.h5.gz"
# # Prepare test data as in scvi-tools tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/query_hlca_knn.html
# python <<HEREDOC
# import pandas as pd
# import scanpy as sc
# geo_metadata_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE171nnn/GSE171668/suppl/GSE171668_lung_metadata.csv.gz"
# metadata = pd.read_csv(geo_metadata_url, index_col=0)
# DATA_PATH = "${OUT}_query_test.h5"
# query_data = sc.read_10x_h5(DATA_PATH)
# # clean up .var.index (gene names)
# query_data.var['gene_names'] = query_data.var.index
# query_data.var.index = [idx.split("___")[-1] for idx in query_data.var.gene_ids]
# # clean up cell barcodes:
# query_data.obs.index = query_data.obs.index.str.rstrip("-1")
# # read in metadata (to select only cells of interest and remove empty drops)
# # subset to cells from our sample
# metadata = metadata.loc[metadata.donor == "D12_4",:].copy()
# # clean up barcodes:
# metadata.index = [idx.split("-")[-1] for idx in metadata.index]
# # subset adata to cells in metadata:
# query_data = query_data[metadata.index,:].copy()
# # add dataset information:
# query_data.obs['dataset'] = "test_dataset_delorey_regev"
# sc.write(DATA_PATH, query_data)
# HEREDOC
# # convert 10x h5 to h5mu
# viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml -- \
# --input "${OUT}_query_test.h5" \
# --output "${OUT}_query_test.h5mu"

View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# Builds the `merge_test_data` test resources by splitting the
# pbmc_1k_protein_v3 h5mu file into its modalities.
# Needs the split_modalities executable under target/docker/ and the
# pbmc_1k_protein_v3 test resource to be present.

# stop at the first failing command instead of reporting success
set -eo pipefail

# all paths below are relative to the repository root, so run from there
# regardless of where the script was started
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=merge_test_data
OUT=resources_test/$ID
DIR="$OUT"

mkdir -p "$OUT"

target/docker/dataflow/split_modalities/split_modalities \
  --input resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu \
  --output "$OUT"

View File

@@ -0,0 +1,119 @@
#!/bin/bash
#
# Builds the `pbmc_1k_protein_v3` test resources under resources_test/:
# downloads the 10x dataset, converts it to h5mu, runs it through the
# single-sample, add_id, multi-sample, dimensionality-reduction and
# integration workflows, and finally exports the RNA modality as h5ad.

set -eo pipefail

# run from the repository root so that all relative paths below resolve
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

ID=pbmc_1k_protein_v3
# common path prefix of the generated files, and the directory holding them
OUT=resources_test/$ID/$ID
DIR=$(dirname "$OUT")
# bare file-name prefix, used for workflow output names
out_name=$(basename "$OUT")

# ideally, this would be a versioned pipeline run

mkdir -p "$DIR"

# dataset page:
# https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-gene-expression-and-cell-surface-protein-3-standard-3-0-0

# download metrics summary
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv \
  -O "${OUT}_metrics_summary.csv"

# download counts h5 files (filtered and raw)
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 \
  -O "${OUT}_filtered_feature_bc_matrix.h5"
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \
  -O "${OUT}_raw_feature_bc_matrix.h5"

# download the counts matrix tarball, unpack it into its own directory and
# remove the archive again
matrix_tar="${OUT}_filtered_feature_bc_matrix.tar.gz"
wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.tar.gz \
  -O "$matrix_tar"
mkdir -p "${OUT}_filtered_feature_bc_matrix"
tar -xvf "$matrix_tar" \
  -C "${OUT}_filtered_feature_bc_matrix" \
  --strip-components 1
rm "$matrix_tar"

# convert 10x h5 to h5mu
target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu \
  --input "${OUT}_filtered_feature_bc_matrix.h5" \
  --input_metrics_summary "${OUT}_metrics_summary.csv" \
  --output "${OUT}_filtered_feature_bc_matrix.h5mu"

# run single sample
nextflow \
  run . \
  -main-script target/nextflow/workflows/rna/rna_singlesample/main.nf \
  -c src/workflows/utils/labels_ci.config \
  -profile docker \
  --id pbmc_1k_protein_v3_uss \
  --input "${OUT}_filtered_feature_bc_matrix.h5mu" \
  --output "${out_name}_uss.h5mu" \
  --publishDir "$DIR" \
  -resume

# add the sample ID to the mudata object
nextflow \
  run . \
  -main-script target/nextflow/metadata/add_id/main.nf \
  -c src/workflows/utils/labels_ci.config \
  -profile docker \
  --id pbmc_1k_protein_v3_uss \
  --input "${OUT}_uss.h5mu" \
  --input_id "pbmc_1k_protein_v3_uss" \
  --output "${out_name}_uss_with_id.h5mu" \
  --output_compression "gzip" \
  --publishDir "$DIR" \
  -resume

# run multisample; the with-id file is only an intermediate and is removed after
nextflow \
  run . \
  -main-script target/nextflow/workflows/rna/rna_multisample/main.nf \
  -c src/workflows/utils/labels_ci.config \
  -profile docker \
  --id pbmc_1k_protein_v3_ums \
  --input "${OUT}_uss_with_id.h5mu" \
  --output "${out_name}_ums.h5mu" \
  --publishDir "$DIR" \
  -resume
rm "${OUT}_uss_with_id.h5mu"

# run dimred
nextflow \
  run . \
  -main-script target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf \
  -c src/workflows/utils/labels_ci.config \
  -profile docker \
  --id pbmc_1k_protein_v3_mms \
  --input "${OUT}_ums.h5mu" \
  --output "${out_name}_mms.h5mu" \
  --publishDir "$DIR" \
  --obs_covariates sample_id \
  -resume

# run integration
# NOTE(review): the output name equals the input file (`_mms.h5mu`), so the
# dimred result appears to be replaced in place; confirm this is intended
nextflow \
  run . \
  -main-script target/nextflow/workflows/integration/harmony_leiden/main.nf \
  -c src/workflows/utils/labels_ci.config \
  -profile docker \
  --id pbmc_1k_protein_v3_mms_integration \
  --input "${OUT}_mms.h5mu" \
  --output "${out_name}_mms.h5mu" \
  --publishDir "$DIR" \
  --obs_covariates sample_id \
  -resume

# export the RNA modality of the converted dataset as a standalone h5ad
python <<HEREDOC
import mudata as mu
mudata = mu.read_h5mu("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu")
mudata.mod["rna"].write_h5ad("${DIR}/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad")
HEREDOC

View File

@@ -0,0 +1,61 @@
#!/bin/bash
#
# Builds the `reference_gencodev41_chr1` test resources under resources_test/.
# This first part downloads the ERCC and JASPAR input files and rewrites the
# JASPAR motif headers.

set -eo pipefail

# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=reference_gencodev41_chr1
OUT=resources_test/$ID

mkdir -p "$OUT"

wget "https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" -O "$OUT/ERCC92.zip"

# Download JASPAR files for reference building
# Source of the code below: https://support.10xgenomics.com/single-cell-atac/software/release-notes/references#GRCh38-2020-A-2.0.0
motifs_url="https://jaspar.elixir.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
motifs_in="${OUT}/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt"
if [ ! -f "$motifs_in" ]; then
  # -f makes curl fail on HTTP errors instead of saving the error page.
  # Download to a temporary name and rename on success, so a failed or
  # interrupted transfer never leaves a file behind that satisfies the guard
  # above and gets used as motif input on the next run.
  curl -sSf "$motifs_url" -o "${motifs_in}.tmp"
  mv "${motifs_in}.tmp" "$motifs_in"
fi

# Change motif headers so the human-readable motif name precedes the motif
# identifier. So ">MA0004.1 Arnt" -> ">Arnt_MA0004.1".
motifs_modified="${OUT}/$(basename "$motifs_in").modified"
awk '{
    if ( substr($1, 1, 1) == ">" ) {
        print ">" $2 "_" substr($1,2)
    } else {
        print
    }
}' "$motifs_in" > "$motifs_modified"
# Write the parameter file for the make_reference workflow.
# The heredoc delimiter is unquoted, so $ID and $motifs_modified are expanded
# by the shell at the moment the file is written.
# NOTE(review): the file is written to a fixed path in /tmp; runs that use the
# same file name at the same time would overwrite each other's parameters.
cat > /tmp/params.yaml << HERE
param_list:
- id: "$ID"
genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz"
target: ["bd_rhapsody", "cellranger_arc"]
output_fasta: "reference.fa.gz"
output_gtf: "reference.gtf.gz"
non_nuclear_contigs: null
output_cellranger_arc: "reference_cellranger.tar.gz"
output_bd_rhapsody: "reference_bd_rhapsody.tar.gz"
bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2"
motifs_file: "$motifs_modified"
subset_regex: "chr1"
HERE
# Run the make_reference workflow with the parameters written above and
# publish its results into $OUT.
nextflow \
run . \
-main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
-profile docker \
-c ./src/workflows/utils/labels_ci.config \
-params-file /tmp/params.yaml \
--publish_dir $OUT \
-resume

View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Writes the `remote_param_list` test resources: the same two-entry parameter
# list ("mouse" and "human", both pointing at s3:// inputs) rendered as a
# YAML, a CSV and a JSON file.
set -eo pipefail
# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
# settings
mkdir -p "resources_test/remote_param_list/"
# output paths of the three renderings
OUT=resources_test/remote_param_list/test_param_list.yaml
OUT_CSV=resources_test/remote_param_list/test_param_list.csv
OUT_JSON=resources_test/remote_param_list/test_param_list.json
# YAML rendering
cat > $OUT << HERE
- id: "mouse"
input: s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
publish_dir: "foo_remote/"
rna_min_counts: 2
prot_min_counts: 3
- id: "human"
input: s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu
publish_dir: "foo_remote/"
rna_min_counts: 2
prot_min_counts: 3
HERE
# CSV rendering: header row plus one row per entry, every field quoted
cat > $OUT_CSV << EOF
"id","input","publish_dir","rna_min_counts","prot_min_counts"
"mouse","s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
"human","s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3"
EOF
# JSON rendering: an array with one object per entry
cat > $OUT_JSON << HERE
[
{
"id": "mouse",
"input": "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
"publish_dir": "foo_remote/",
"rna_min_counts": 2,
"prot_min_counts": 3
},
{
"id": "human",
"input": "s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
"publish_dir": "foo_remote/",
"rna_min_counts": 2,
"prot_min_counts": 3
}
]
HERE

View File

@@ -0,0 +1,54 @@
#!/bin/bash
#
# Builds the `rna_velocity` test resources under resources_test/ from two
# existing test resources: the BD Rhapsody WTA output (bdrhap_5kjrt) and the
# Cell Ranger tiny fastq dataset (cellranger_tiny_fastq).

set -eo pipefail

# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=rna_velocity
OUT=resources_test/$ID

# create raw directory
velocyto_dir="$OUT/velocyto"
mkdir -p "$velocyto_dir"

########################################################
# Create a compatible BAM file from BD Rhapsody Output #
########################################################

bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/WTA.bd_rhapsody.output_raw/sample_final.BAM"
if [[ ! -f "$bd_rhap_wta_bam" ]]; then
  echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
  exit 1
fi

echo "> Converting BD Rhapsody barcode tags."
viash run src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml -- \
  -i "$bd_rhap_wta_bam" \
  -o "$velocyto_dir/compatible_bd_input.bam" \
  --bam \
  -t 4

echo "> Creating barcodes file."
# Keep the first 10 unique cell barcodes (CB:Z: tags).
# `sed -n '1,10p'` is used instead of `head`: head exits after 10 lines, which
# can kill the upstream `uniq` with SIGPIPE, and with `set -o pipefail` that
# would abort the whole script. sed reads its input to the end.
samtools view -@4 "$velocyto_dir/compatible_bd_input.bam" | \
  grep -oP "(?<=CB:Z:)\S+" | sort | uniq | sed -n '1,10p' > "$velocyto_dir/barcodes.txt"

###########################################################
# Process the tiny fastq dataset from 10x to create       #
# input data for convert/from_velocyto_to_h5mu component  #
###########################################################

# -p: do not fail when the directory is left over from an earlier run
mkdir -p "$OUT/velocyto_processed"

gtf="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz"
bam="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"

echo "> Processing 10x dataset"
viash run src/velocity/velocyto/config.vsh.yaml -- \
  -i "$bam" \
  -o "$OUT/velocyto_processed/cellranger_tiny.loom" \
  --transcriptome "$gtf"

View File

@@ -0,0 +1,111 @@
#!/bin/bash
#
# Builds the `scgpt` test resources under resources_test/.
# This first part sets up the output directory and checks for gdown.
#
# The shebang above was missing: the script uses bash-only syntax (`&>`,
# `-o pipefail`), so it must not be run by whatever shell happens to start it.

set -eo pipefail

# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# settings
ID=scgpt
OUT=resources_test/$ID

# create foundational model directory
foundation_model_dir="$OUT/source"
mkdir -p "$foundation_model_dir"

# gdown is used for the downloads below; it is not installed by this script,
# so stop early with a clear message when it is not on the PATH
if ! command -v gdown &> /dev/null; then
  echo "This script requires gdown. Please make sure the binary is added to your PATH."
  exit 1
fi
echo "> Downloading scGPT foundation model (full_human)"
# download foundational model files (full_human)
# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y
# each entry is "<file id>:<target file name>"
for entry in \
  "1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC:vocab.json" \
  "1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1:args.json" \
  "14AebJfGOUF047Eg40hk57HCtrb0fyDTm:best_model.pt"; do
  gdown "${entry%%:*}" -O "${foundation_model_dir}/${entry#*:}"
done

# create test data dir
test_resources_dir="$OUT/test_resources"
mkdir -p "$test_resources_dir"

echo "> Downloading test resources"
# download test data
# https://drive.google.com/file/d/1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL/view?usp=drive_link
lung_prefix="${test_resources_dir}/Kim2020_Lung"
gdown '1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL' -O "${lung_prefix}.h5ad"

echo "> Converting to h5mu"
# wrap the AnnData object as the "rna" modality of a new MuData file
python <<HEREDOC
import anndata as ad
import mudata as mu
input_adata = ad.read_h5ad("${test_resources_dir}/Kim2020_Lung.h5ad")
input_mdata = mu.MuData({'rna': input_adata})
input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu")
HEREDOC

echo "> Subsetting datasets"
viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
  --input "${lung_prefix}.h5mu" \
  --output "${lung_prefix}_subset.h5mu" \
  --number_of_observations 4000

# the h5ad download is no longer needed once the h5mu files exist
rm "${lung_prefix}.h5ad"

echo "> Preprocessing datasets"
nextflow \
  run . \
  -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
  -profile docker \
  -c src/workflows/utils/labels_ci.config \
  --input "${lung_prefix}_subset.h5mu" \
  --output "Kim2020_Lung_subset_preprocessed.h5mu" \
  --publish_dir "${test_resources_dir}"
echo "> Filtering highly variable features"
# Annotate the 1200 top highly variable features in the preprocessed dataset.
# The input is the file published by the preprocessing workflow; the path
# previously read `iKim2020_...`, a file that no earlier step creates.
viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \
  --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \
  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
  --layer "log_normalized" \
  --var_name_filter "filter_with_hvg" \
  --n_top_features 1200 \
  --flavor "seurat_v3"

# Apply the `filter_with_hvg` annotation written above as a feature filter.
viash run src/filter/do_filter/config.vsh.yaml -p docker -- \
  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
  --var_filter "filter_with_hvg"
# shared inputs of the scGPT steps: the model vocabulary and the common
# prefix of the intermediate dataset files
scgpt_vocab="${foundation_model_dir}/vocab.json"
subset_prefix="${test_resources_dir}/Kim2020_Lung_subset"

echo "> Running scGPT cross check genes"
viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \
  --input "${subset_prefix}_hvg_filtered.h5mu" \
  --output "${subset_prefix}_genes_cross_checked.h5mu" \
  --vocab_file "$scgpt_vocab"

echo "> Running scGPT binning"
viash run src/scgpt/binning/config.vsh.yaml -p docker -- \
  --input "${subset_prefix}_genes_cross_checked.h5mu" \
  --input_layer "log_normalized" \
  --output "${subset_prefix}_binned.h5mu"

echo "> Running scGPT tokenizing"
viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \
  --input "${subset_prefix}_binned.h5mu" \
  --input_layer "binned" \
  --output "${subset_prefix}_tokenized.h5mu" \
  --model_vocab "$scgpt_vocab"

echo "> Running scGPT integration"
viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
  --input "${subset_prefix}_tokenized.h5mu" \
  --output "${subset_prefix}_scgpt_integrated.h5mu" \
  --model "${foundation_model_dir}/best_model.pt" \
  --model_vocab "$scgpt_vocab" \
  --model_config "${foundation_model_dir}/args.json" \
  --obs_batch_label "sample"

echo "> Removing unnecessary files in test resources dir"
# deletes every file whose name is not both `Kim2020_*` and `*.h5mu`
find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete

echo "> scGPT test resources are ready!"

View File

@@ -0,0 +1,14 @@
#!/bin/bash
#
# Downloads the vireo tutorial VCF into resources_test/vireo_test_data,
# relative to the current working directory.

set -eo pipefail

# settings
ID=vireo_test_data
OUT=resources_test/$ID
DIR="$OUT"

# the download lands in the output directory
mkdir -p "$OUT"
cd "$OUT"

# download vireo tutorial dataset
vcf_url=https://github.com/single-cell-genetics/vireo/raw/master/data/cells.cellSNP.vcf.gz
wget "$vcf_url"

1
schemas/author.yaml Normal file
View File

@@ -0,0 +1 @@
$ref: "defs_common.yaml#/definitions/Author"

395
schemas/defs_common.yaml Normal file
View File

@@ -0,0 +1,395 @@
definitions:
Config:
description: "A Viash configuration is a YAML file which contains metadata to\
\ describe the behaviour and build target(s) of a component. \nWe commonly\
\ name this file `config.vsh.yaml` in our examples, but you can name it however\
\ you choose. \n"
type: "object"
properties:
label:
description: "A clean version of the component's name. This is only used for\
\ documentation."
type: "string"
license:
description: "The license of the package."
type: "string"
authors:
description: "A list of authors. An author must at least have a name, but\
\ can also have a list of roles, an e-mail address, and a map of custom\
\ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\
\ Description |\n|------|---------|-------------|\n| maintainer | mnt |\
\ for the maintainer of the code. Ideally, exactly one maintainer is specified.\
\ |\n| author | aut | for persons who have made substantial contributions\
\ to the software. |\n| contributor | ctb| for persons who have made smaller\
\ contributions (such as code patches).\n| datacontributor | dtc | for persons\
\ or organisations that contributed data sets for the software\n| copyrightholder\
\ | cph | for all copyright holders. This is a legal concept so should use\
\ the legal name of an institution or corporate body.\n| funder | fnd |\
\ for persons or organizations that furnished financial support for the\
\ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\
\ is extremely comprehensive.\n"
type: "array"
items:
type: object
properties:
__merge__:
type: string
pattern: "^/src/authors/.*\\.yaml$"
roles:
description: |
Role of the author. Possible values:
* `"author"`: Authors who have made substantial contributions to the component.
* `"maintainer"`: The maintainer of the component.
* `"contributor"`: Authors who have made smaller contributions (such as code patches etc.).
type: array
items:
enum: [maintainer, author, contributor]
status:
description: "Allows setting a component to active, deprecated or disabled."
$ref: "defs_viash.yaml#/definitions/Status"
requirements:
description: "Computational requirements related to running the component.\
\ \n`cpus` specifies the maximum number of (logical) cpus a component is\
\ allowed to use, whereas\n`memory` specifies the maximum amount of memory\
\ a component is allowed to allocate. Memory units must be\nin B, KB, MB,\
\ GB, TB or PB for SI units (1000-base), or KiB, MiB, GiB, TiB or PiB for\
\ binary IEC units (1024-base)."
$ref: "defs_viash.yaml#/definitions/ComputationalRequirements"
repositories:
description: "(Pre-)defines repositories that can be used as repository in\
\ dependencies.\nAllows reusing repository definitions in case it is used\
\ in multiple dependencies."
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/RepositoryWithName"
dependencies:
description: "Allows listing Viash components required by this Viash component"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Dependency"
summary:
description: "A one-sentence summary of the component. This is only used for\
\ documentation."
type: "string"
runners:
description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\
\ - NextflowRunner\n"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Runner"
name:
description: "Name of the component and the filename of the executable when\
\ built with `viash build`."
type: "string"
argument_groups:
description: "A grouping of the arguments, used to display the help message.\n\
\n - `name: foo`, the name of the argument group. \n - `description: Description\
\ of foo`, a description of the argument group. Multiline descriptions are\
\ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\
\n"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/ArgumentGroup"
description:
description: "A description of the component. This is only used for documentation.\
\ Multiline descriptions are supported."
type: "string"
usage:
description: "A description on how to use the component. This will be displayed\
\ with `--help` under the 'Usage:' section."
type: "string"
info:
description: "Structured information. Can be any shape: a string, vector,\
\ map or even nested map."
type: "object"
version:
description: "Version of the component. This field will be used to version\
\ the executable and the Docker container."
type: "string"
links:
description: "External links of the component."
$ref: "defs_viash.yaml#/definitions/Links"
references:
description: "References to external resources related to the component."
$ref: "defs_viash.yaml#/definitions/References"
engines:
description: "A list of engine environments to execute target artifacts in.\n\
\n - NativeEngine\n - DockerEngine\n"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Engine"
resources:
description: "Resources are files that support the component. The first resource\
\ should be a script that will be executed when the component is run. Additional\
\ resources will be copied to the same directory.\n\nCommon properties:\n\
\n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\
\ / `scala_script` / `csharp_script`, specifies the type of the resource.\
\ The first resource cannot be of type `file`. When the type is not specified,\
\ the default type is simply `file`.\n * dest: filename, the resulting name\
\ of the resource. From within a script, the file can be accessed at `meta[\"\
resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\
\ the basename of the `path` parameter.\n * path: `path/to/file`, the path\
\ of the input file. Can be a relative or an absolute path, or a URI. Mutually\
\ exclusive with `text`.\n * text: ...multiline text..., the content of\
\ the resulting file specified as a string. Mutually exclusive with `path`.\n\
\ * is_executable: `true` / `false`, whether the resulting resource file\
\ should be made executable.\n"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Resource"
keywords:
description: "The keywords of the components."
type: "array"
items:
type: "string"
test_resources:
description: "One or more scripts to be used to test the component behaviour\
\ when `viash test` is invoked. Additional files of type `file` will be\
\ made available only during testing. Each test script should expect no\
\ command-line inputs, be platform-independent, and return an exit code\
\ >0 when unexpected behaviour occurs during testing. See Unit Testing for\
\ more info."
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Resource"
namespace:
description: "Namespace this component is a part of. See the Namespaces guide\
\ for more information on namespaces."
type: "string"
arguments:
description: "A list of arguments for this component. For each argument, a\
\ type and a name must be specified. Depending on the type of argument,\
\ different properties can be set. See these reference pages per type for\
\ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\
\ - boolean_true\n - boolean_false\n"
type: "array"
items:
$ref: "defs_viash.yaml#/definitions/Argument"
__merge__:
$ref: "#/definitions/Merge"
required:
- "name"
additionalProperties: false
Merge:
  # No top-level `type` here: it would be AND-ed with `oneOf` and reject
  # the list form (e.g. `__merge__: [ /src/base/..., . ]`).
  description: Path to a YAML file to inherit values from, or a list of such paths.
  oneOf:
    - type: string
    - type: array
      items:
        type: string
FileFormat:
description: 'File format metadata'
type: object
required: [label, file_format]
properties:
label:
$ref: "defs_common.yaml#/definitions/Label"
summary:
$ref: "defs_common.yaml#/definitions/Summary"
file_format:
oneOf:
- type: object
required: [type]
additionalProperties: false
properties:
type:
const: h5ad
X:
$ref: "#/definitions/AnnDataSlot"
layers:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
var:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
varm:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
varp:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obs:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obsm:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obsp:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
uns:
type: array
items:
oneOf:
- $ref: "#/definitions/AnnDataSlot"
- $ref: "#/definitions/AnnDataSlotObject"
- type: object
required: [type, mod]
additionalProperties: false
properties:
type:
const: h5mu
mod:
  # Each known modality must follow the AnnData definition; a sub-schema has
  # to be an object, so the pointer is wrapped in `$ref`.
  type: object
  additionalProperties: false
  properties:
    rna:
      $ref: "#/definitions/AnnData"
    atac:
      $ref: "#/definitions/AnnData"
    prot:
      $ref: "#/definitions/AnnData"
    vdj:
      $ref: "#/definitions/AnnData"
    vdj_t:
      $ref: "#/definitions/AnnData"
    vdj_b:
      $ref: "#/definitions/AnnData"
    gdo:
      $ref: "#/definitions/AnnData"
    hto:
      $ref: "#/definitions/AnnData"
MuData:
  # A MuData description: a `mod` mapping whose keys are restricted to the
  # modalities listed below, each described by the AnnData definition.
  required: [mod]
  additionalProperties: false
  properties:
    mod:
      type: object
      additionalProperties: false
      properties:
        rna:
          $ref: "#/definitions/AnnData"
        atac:
          $ref: "#/definitions/AnnData"
        prot:
          $ref: "#/definitions/AnnData"
        vdj:
          $ref: "#/definitions/AnnData"
        vdj_t:
          $ref: "#/definitions/AnnData"
        vdj_b:
          $ref: "#/definitions/AnnData"
        gdo:
          $ref: "#/definitions/AnnData"
        hto:
          $ref: "#/definitions/AnnData"
AnnData:
additionalProperties: false
properties:
X:
$ref: "#/definitions/AnnDataSlot"
layers:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
var:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
varm:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
varp:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obs:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obsm:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
obsp:
type: array
items:
$ref: "#/definitions/AnnDataSlot"
uns:
type: array
items:
oneOf:
- $ref: "#/definitions/AnnDataSlot"
- $ref: "#/definitions/AnnDataSlotObject"
AnnDataSlot:
properties:
type:
enum: [integer, double, string, boolean]
name:
type: string
description: A unique identifier.
pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$"
description:
type: string
required:
type: boolean
required: [type, name, description, required]
AnnDataSlotObject:
properties:
type:
enum: [object]
name:
type: string
description: A unique identifier.
pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$"
description:
type: string
required:
type: boolean
required: [type, name, description, required]
# added specific properties to the author info
Author:
description: Author metadata.
type: object
additionalProperties: false
properties:
name:
description: Full name of the author, usually in the form FirstName MiddleName LastName.
type: string
info:
description: Additional information on the author
type: object
additionalProperties: false
required: [role, links, organizations]
properties:
links:
  # Contact links of the author; only the GitHub handle is mandatory.
  type: object
  additionalProperties: false
  # `required` must be a list of property names, not a bare string.
  required: [github]
  properties:
    github:
      type: string
    orcid:
      type: string
    email:
      type: string
    twitter:
      type: string
    linkedin:
      type: string
role:
description: Role in the organisation
enum: ["Core Team Member", "Contributor"]
organizations:
type: array
minItems: 1
items:
type: object
additionalProperties: false
required: [name, href, role]
properties:
name:
type: string
href:
type: string
role:
type: string

3135
schemas/defs_viash.yaml Normal file

File diff suppressed because it is too large Load Diff

56
schemas/file_format.yaml Normal file
View File

@@ -0,0 +1,56 @@
title: File API
description: A file format specification file.
type: "object"
properties:
info:
$ref: "defs_common.yaml#/definitions/FileFormat"
create_parent:
description: "If the output filename is a path and it does not exist, create\
\ it before executing the script (only for `direction: output`)."
type: "boolean"
default:
anyOf:
- description: "The default value when no argument value is provided. This\
\ will not work if the [`required`](#required) property is enabled."
type: "string"
- description: "The default value when no argument value is provided. This\
\ will not work if the [`required`](#required) property is enabled."
type: "array"
items:
type: "string"
example:
anyOf:
- description: "An example value for this argument. If no [`default`](#default)\
\ property was specified, this will be used for that purpose."
type: "string"
- description: "An example value for this argument. If no [`default`](#default)\
\ property was specified, this will be used for that purpose."
type: "array"
items:
type: "string"
description:
description: "A description of the argument. This will be displayed with `--help`."
type: "string"
multiple_sep:
description: "The delimiter character for providing [`multiple`](#multiple)\
\ values. `:` by default."
type: "string"
multiple:
description: "Treat the argument value as an array. Arrays can be passed using\
\ the delimiter `--foo=1:2:3` or by providing the same argument multiple\
\ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\
\ property. `false` by default."
type: "boolean"
type:
description: "A `file` type argument has a string value that points to a file\
\ or folder path."
const: "file"
required:
description: "Make the value for this argument required. If set to `true`,\
\ an error will be produced if no value was provided. `false` by default."
type: "boolean"
__merge__:
$ref: "defs_common.yaml#/definitions/Merge"
required: [type, info]
additionalProperties: false

View File

@@ -0,0 +1 @@
$ref: "defs_viash.yaml#/definitions/PackageConfig"

View File

@@ -0,0 +1,2 @@
oneOf:
- $ref: "defs_common.yaml#/definitions/Config"

View File

@@ -0,0 +1,150 @@
name: celltypist
namespace: annotate
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
- name: "--var_query_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
default: "cell_ontology_class"
- name: "--check_expression"
type: boolean_true
description: |
Whether to check that the expression of the reference dataset is in the format recommended by CellTypist.
CellTypist requires data to be log-normalized to 10000 counts per cell.
- name: "--var_reference_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: false
- name: "--majority_voting"
type: boolean
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
default: false
- name: "--C"
type: double
description: "Inverse of regularization strength in logistic regression."
default: 1.0
- name: "--max_iter"
type: integer
description: "Maximum number of iterations before reaching the minimum of the cost function."
default: 1000
- name: "--use_SGD"
type: boolean_true
description: "Whether to use the stochastic gradient descent algorithm."
- name: "--min_prop"
type: double
description: |
For the dominant cell type within a subcluster, the minimum proportion of cells required to
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
A subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
default: 0
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_obs_predictions"
type: string
default: celltypist_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: celltypist_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.10-slim
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
- type: python
packages:
- celltypist==1.6.3
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,115 @@
import sys
import logging
import celltypist
import mudata as mu
import re
import numpy as np
## VIASH START
# Placeholder arguments for running this script directly during development.
# The keys mirror the arguments declared in the component's config.vsh.yaml;
# every key that main() reads must be present here.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_log_normalized.h5mu",
    "output": "output.h5mu",
    "modality": "rna",
    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
    "model": None,
    "reference_obs_target": "cell_ontology_class",
    "check_expression": False,
    "feature_selection": True,
    "majority_voting": True,
    # Training / voting parameters, set to the defaults from the config.
    "C": 1.0,
    "max_iter": 1000,
    "use_SGD": False,
    "min_prop": 0,
    "output_compression": "gzip",
    "var_query_gene_names": None,
    "var_reference_gene_names": "ensemblid",
    "input_layer": None,
    "reference_layer": None,
    "output_obs_predictions": "celltypist_pred",
    # Key name must match `--output_obs_probability` (singular).
    "output_obs_probability": "celltypist_probability",
}
meta = {
}
## VIASH END
# START TEMPORARY WORKAROUND setup_logger
def setup_logger():
    """Attach an INFO-level stdout handler to the root logger and return it.

    Every call adds a new ``StreamHandler`` writing to ``sys.stdout``, so
    calling this more than once results in duplicated log lines.
    """
    formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(stdout_handler)
    return root
# END TEMPORARY WORKAROUND setup_logger
def check_celltypist_format(indata):
    """Return whether the first row of `indata` looks log1p-normalized to 10000.

    Only ``indata[0]`` is inspected: its values are passed through ``expm1``
    and summed, and the row is accepted when that sum is within 1 of 10000.
    """
    first_row_total = np.expm1(indata[0]).sum()
    deviation = np.abs(first_row_total - 10000)
    # Written as "not greater than" on purpose: a NaN deviation is accepted.
    return not deviation > 1
def set_var_index(adata, var_name):
    """Replace the `.var` index of `adata` with the values of column `var_name`.

    A trailing ``.<digits>`` suffix is stripped from every value before it is
    used as index entry. The object is modified in place and also returned.
    """
    stripped_names = []
    for gene_name in adata.var[var_name]:
        stripped_names.append(re.sub("\\.[0-9]+$", "", gene_name))
    adata.var.index = stripped_names
    return adata
def main(par):
    """Annotate the query modality with CellTypist and write the result.

    Exactly one of ``par["model"]`` (a pre-trained model file) and
    ``par["reference"]`` (a reference .h5mu to train on) must be given.
    The predicted labels and the highest per-cell probability are stored in
    the ``.obs`` columns named by ``par["output_obs_predictions"]`` and
    ``par["output_obs_probability"]``, and the MuData object is written to
    ``par["output"]``.

    Parameters
    ----------
    par : dict
        Component arguments as declared in config.vsh.yaml.

    Raises
    ------
    ValueError
        If both or neither of 'model' and 'reference' are provided, or if
        fewer than 100 genes are shared between query and reference.
    """
    if bool(par["model"]) == bool(par["reference"]):
        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")

    logger = setup_logger()

    input_mudata = mu.read_h5mu(par["input"])
    input_modality = input_mudata.mod[par["modality"]].copy()

    # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
    # CellTypist requires query gene names to be in the same format as the reference data.
    if par["var_query_gene_names"]:
        input_modality = set_var_index(input_modality, par["var_query_gene_names"])

    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X

    if par["model"]:
        logger.info("Loading CellTypist model")
        model = celltypist.models.Model.load(par["model"])
    else:
        reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]

        if par["var_reference_gene_names"]:
            reference_modality = set_var_index(reference_modality, par["var_reference_gene_names"])

        logger.info("Detecting common vars")
        common_ens_ids = reference_modality.var.index.intersection(input_modality.var.index)
        logger.info(" reference n_vars: %i", reference_modality.n_vars)
        logger.info(" input n_vars: %i", input_modality.n_vars)
        logger.info(" intersect n_vars: %i", len(common_ens_ids))
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if len(common_ens_ids) < 100:
            raise ValueError("The intersection of genes is too small.")

        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X

        if not check_celltypist_format(input_matrix):
            logger.warning("Input data is not in the reccommended format for CellTypist.")
        if not check_celltypist_format(reference_matrix):
            logger.warning("Reference data is not in the reccommended format for CellTypist.")

        labels = reference_modality.obs[par["reference_obs_target"]]

        logger.info("Training CellTypist model on reference")
        model = celltypist.train(reference_matrix,
                                 labels=labels,
                                 genes=reference_modality.var.index,
                                 C=par["C"],
                                 max_iter=par["max_iter"],
                                 use_SGD=par["use_SGD"],
                                 feature_selection=par["feature_selection"],
                                 check_expression=par["check_expression"])

    # celltypist.annotate() reads the expression values from `.X`. To honour
    # --input_layer, the requested layer is presented as `.X` on a separate
    # copy, so the `.X` that ends up in the output file stays untouched.
    if par["input_layer"]:
        query_modality = input_modality.copy()
        query_modality.X = input_matrix
    else:
        query_modality = input_modality

    # Forward --min_prop; fall back to 0 when the key is absent or unset.
    min_prop = par.get("min_prop")
    if min_prop is None:
        min_prop = 0

    logger.info("Predicting CellTypist annotations")
    predictions = celltypist.annotate(query_modality,
                                      model,
                                      majority_voting=par["majority_voting"],
                                      min_prop=min_prop)

    input_modality.obs[par["output_obs_predictions"]] = predictions.predicted_labels["predicted_labels"]
    # Keep only the highest class probability per cell.
    input_modality.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(axis=1).values

    input_mudata.mod[par["modality"]] = input_modality
    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])


if __name__ == '__main__':
    main(par)

View File

@@ -0,0 +1,148 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
import scanpy as sc
import anndata as ad
from openpipelinetestutils.asserters import assert_annotation_objects_equal
## VIASH START
meta = {"resources_dir": "resources_test"}
## VIASH END

# Test resources, all resolved relative to the resources directory.
_pbmc_dir = f"{meta['resources_dir']}/pbmc_1k_protein_v3"
_annotation_dir = f"{meta['resources_dir']}/annotation_test_data"

input_file = f"{_pbmc_dir}/pbmc_1k_protein_v3_mms.h5mu"
reference_file = f"{_annotation_dir}/TS_Blood_filtered.h5mu"
model_file = f"{_annotation_dir}/celltypist_model_Immune_All_Low.pkl"
celltypist_input_file = f"{_annotation_dir}/demo_2000_cells.h5mu"
@pytest.fixture
def normalize_log_transform(random_h5mu_path):
    """Fixture returning a helper that log-normalizes one modality of an .h5mu file.

    The helper reads the file, replaces `.X` of the requested modality with
    total-count normalized (to `target_sum`) and log1p-transformed values,
    writes the result to a fresh path from `random_h5mu_path` and returns
    that path.
    """
    def wrapper(input_mudata_file, modality, target_sum=1e4):
        mdata = mu.read_h5mu(input_mudata_file)
        modality_copy = mdata.mod[modality].copy()

        # Run scanpy on a scratch AnnData that only holds a copy of X, so the
        # in-place transformations cannot touch anything else.
        scratch = ad.AnnData(X=modality_copy.X.copy())
        sc.pp.normalize_total(scratch, target_sum=target_sum)
        sc.pp.log1p(scratch, base=None, layer=None, copy=False)

        modality_copy.X = scratch.X
        modality_copy.uns['log1p'] = scratch.uns['log1p'].copy()
        mdata.mod[modality] = modality_copy

        transformed_path = random_h5mu_path()
        mdata.write_h5mu(transformed_path)
        return transformed_path

    return wrapper
def test_simple_execution(run_component, random_h5mu_path, normalize_log_transform):
    """Train on the reference with default settings and check the annotated output."""
    output_file = random_h5mu_path()
    input_file_transformed = normalize_log_transform(input_file, "rna")

    run_component([
        "--input", input_file_transformed,
        "--reference", reference_file,
        # The component declares `--reference_obs_target` (singular); the
        # plural spelling is not an argument of the component.
        "--reference_obs_target", "cell_ontology_class",
        "--var_reference_gene_names", "ensemblid",
        "--output", output_file
    ])

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file_transformed)
    output_mudata = mu.read_h5mu(output_file)

    # The modality that was not processed must come through unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"

    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
def test_set_params(run_component, random_h5mu_path, normalize_log_transform):
    """Run the component with non-default training parameters and check the output."""
    output_file = random_h5mu_path()
    input_file_transformed = normalize_log_transform(input_file, "rna")

    cli_args = [
        "--input", input_file_transformed,
        "--reference", reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--var_reference_gene_names", "ensemblid",
        "--feature_selection", "True",
        "--majority_voting", "True",
        "--C", "0.5",
        "--max_iter", "100",
        "--use_SGD",
        "--min_prop", "0.1",
        "--input_layer", "log_normalized",
        "--output", output_file,
        "--output_compression", "gzip",
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    expected = mu.read_h5mu(input_file_transformed)
    result = mu.read_h5mu(output_file)

    # The modality that was not processed must come through unchanged.
    assert_annotation_objects_equal(expected.mod["prot"],
                                    result.mod["prot"])

    rna_obs = result.mod["rna"].obs
    assert {'celltypist_pred', 'celltypist_probability'}.issubset(rna_obs.keys()), "Required keys not found in .obs"

    probabilities = rna_obs["celltypist_probability"]
    assert all(0 <= value <= 1 for value in probabilities), ".obs at celltypist_probability has values outside the range [0, 1]"
def test_with_model(run_component, random_h5mu_path):
    """Annotate with a pre-trained model instead of a reference dataset."""
    output_file = random_h5mu_path()

    run_component([
        "--input", celltypist_input_file,
        "--model", model_file,
        # The component declares `--reference_obs_target` (singular); the
        # plural spelling is not an argument of the component.
        "--reference_obs_target", "cell_type",
        "--output", output_file
    ])

    assert os.path.exists(output_file), "Output file does not exist"

    output_mudata = mu.read_h5mu(output_file)

    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"

    obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
def test_fail_check_reference_expression(run_component, random_h5mu_path):
    """With --check_expression the run must fail and report an invalid expression matrix."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--reference", reference_file,
        "--var_reference_gene_names", "ensemblid",
        "--output", output_file,
        "--check_expression"
    ]

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell",
                     captured_stdout)
def test_fail_invalid_input_expression(run_component, random_h5mu_path):
    """An untransformed query must make the run fail with an error about `.X`."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--reference", reference_file,
        "--var_reference_gene_names", "ensemblid",
        "--output", output_file
    ]

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(r"Invalid expression matrix in `.X`, expect log1p normalized expression to 10000 counts per cell",
                     captured_stdout)
# Run this file's tests through pytest and use its return value as the
# process exit code.
if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,136 @@
name: onclass
namespace: annotate
description: |
OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity.
These similarities enable OnClass to annotate cell types that are never seen in the training data.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
required: false
- name: "--cl_nlp_emb_file"
type: file
description: The .nlp.emb file with the cell type embeddings.
required: true
- name: "--cl_ontology_file"
type: file
description: The .ontology file with the cell type ontology.
required: true
- name: "--cl_obo_file"
type: file
description: The .obo file with the cell type ontology.
required: true
- name: "--var_query_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the OnClass model on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
example: "cell_ontology_class"
required: true
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_obs_predictions"
type: string
default: onclass_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: onclass_prob
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
- name: Model arguments
description: Model arguments
arguments:
- name: "--model"
type: string
description: |
"Pretrained model path without a file extension. If not provided, the model will be trained
on the reference data and --reference should be provided. The path namespace should contain:
- a .npz or .pkl file
- a .data file
- a .meta file
- a .index file
e.g. /path/to/model/pretrained_model_target1 as saved by OnClass."
required: false
direction: input
- name: "--max_iter"
type: integer
default: 30
required: false
description: Maximum number of iterations for training the model.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.8
setup:
- type: python
packages:
- scikit-learn==0.24.0
- OnClass==1.2
- tensorflow==2.13.1
- obonet==1.1.0
- mudata
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,196 @@
import sys
import logging
import mudata as mu
import anndata as ad
import re
import numpy as np
from OnClass.OnClassModel import OnClassModel
import obonet
from typing import Dict, Tuple
from tqdm import tqdm
## VIASH START
# Placeholder arguments for running this script directly during development.
# NOTE(review): the component config declares `--reference_obs_target`
# (singular) while the key below is `reference_obs_targets` — confirm which
# key the rest of the script reads.
# NOTE(review): the config gives `output_obs_predictions` and
# `output_obs_probability` non-empty defaults (onclass_pred / onclass_prob);
# here they are None — confirm this is intended.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    "output": "output.h5mu",
    "modality": "rna",
    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
    "model": None,
    "reference_obs_targets": "cell_ontology_class",
    "input_layer": None,
    "reference_layer": None,
    "max_iter": 100,
    "output_obs_predictions": None,
    "output_obs_probability": None,
    "cl_nlp_emb_file": "resources_test/annotation_test_data/ontology/cl.ontology.nlp.emb",
    "cl_ontology_file": "resources_test/annotation_test_data/ontology/cl.ontology",
    "cl_obo_file": "resources_test/annotation_test_data/ontology/cl.obo",
    "output_compression": "gzip"
}
meta = {"resources_dir": "src/annotate/onclass"}
## VIASH END
# Add the component's resources directory to the module search path.
sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    Each call attaches one additional stream handler to the root logger.

    Returns
    -------
    logging.Logger
        The root logger.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(stdout_handler)
    return root_logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
def map_celltype_to_ontology_id(cl_obo_file: str) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Build lookup tables between cell ontology IDs and cell type names.

    Parameters
    ----------
    cl_obo_file : str
        Path to the cell ontology file, read with ``obonet.read_obo``.

    Returns
    -------
    Tuple[Dict[str, str], Dict[str, str]]
        ``(id_to_name, name_to_id)``. Graph nodes without a ``name`` attribute
        are left out of both dictionaries. If several IDs share one name, the
        last one encountered wins in ``name_to_id``.
    """
    ontology_graph = obonet.read_obo(cl_obo_file)

    cl_id_to_name = {}
    for node_id, attributes in ontology_graph.nodes(data=True):
        node_name = attributes.get("name")
        # Unnamed nodes cannot be mapped in either direction.
        if node_name is not None:
            cl_id_to_name[node_id] = node_name

    name_to_cl_id = {}
    for node_id, node_name in cl_id_to_name.items():
        name_to_cl_id[node_name] = node_id

    return cl_id_to_name, name_to_cl_id
def predict_input_data(model: OnClassModel,
                       input_matrix: np.ndarray,
                       input_modality: ad.AnnData,
                       id_to_name: dict,
                       obs_prediction: str,
                       obs_probability: str) -> ad.AnnData:
    """
    Predict cell types for input data and save results to Anndata obj.

    Parameters
    ----------
    model : OnClassModel
        The OnClass model.
    input_matrix : np.ndarray
        The input data matrix.
    input_modality : ad.AnnData
        The input data Anndata object; its ``var_names`` are passed to the model
        as the gene names of ``input_matrix``.
    id_to_name : dict
        Dictionary mapping cell ontology IDs to cell type names.
    obs_prediction : str
        The obs key for the predicted cell type.
    obs_probability : str
        The obs key for the predicted cell type probability.

    Returns
    -------
    ad.AnnData
        The same ``input_modality`` object, modified in place, with the predicted
        cell types and their probabilities saved in obs.

    Raises
    ------
    KeyError
        If a predicted ontology ID is missing from ``id_to_name``.
    """
    corr_test_feature = model.ProcessTestFeature(
        test_feature=input_matrix,
        test_genes=input_modality.var_names,
        log_transform=False,
    )
    onclass_pred = model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0)
    # Element 1 is used as a per-cell score matrix and element 2 as the
    # predicted class indices -- presumably (cells x classes); confirm against OnClass.
    scores = onclass_pred[1]
    pred_label = [model.i2co[ind] for ind in onclass_pred[2]]
    pred_cell_type_label = [id_to_name[cl_id] for cl_id in pred_label]

    input_modality.obs[obs_prediction] = pred_cell_type_label
    # Highest score per row divided by the row sum.
    input_modality.obs[obs_probability] = np.max(scores, axis=1) / scores.sum(1)
    return input_modality
def set_var_index(adata, var_name):
    """Re-index ``adata.var`` by the values of column ``var_name``.

    A trailing ``.<digits>`` suffix is removed from every value before it is
    used as an index entry. ``adata`` is modified in place and also returned.
    """
    version_suffix = re.compile("\\.[0-9]+$")
    stripped_names = []
    for gene_name in adata.var[var_name]:
        stripped_names.append(version_suffix.sub("", gene_name))
    adata.var.index = stripped_names
    return adata
def _dense_matrix(adata, layer):
    """Return ``adata.layers[layer]`` (or ``adata.X`` when ``layer`` is falsy) as a dense array."""
    matrix = adata.layers[layer] if layer else adata.X
    # Sparse matrices expose ``toarray``; anything else is treated as already dense.
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


def main():
    """Annotate the input modality with OnClass and write the result to ``par["output"]``.

    Exactly one of ``par["model"]`` (pre-trained model path) or
    ``par["reference"]`` (reference h5mu to train on) must be set.

    Raises
    ------
    ValueError
        If neither or both of 'model' and 'reference' are provided, or if fewer
        than 100 var names are shared between the reference and the input.
    """
    # Exactly one of the two must be truthy.
    if bool(par["model"]) == bool(par["reference"]):
        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")

    logger.info("Reading input data")
    input_mudata = mu.read_h5mu(par["input"])
    input_modality = input_mudata.mod[par["modality"]].copy()

    # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
    var_query_gene_names = par.get("var_query_gene_names")
    if var_query_gene_names:
        input_modality = set_var_index(input_modality, var_query_gene_names)
    input_matrix = _dense_matrix(input_modality, par["input_layer"])

    id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"])

    # Both modes start from the same ontology-backed model object.
    model = OnClassModel(cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
                         cell_type_network_file=par["cl_ontology_file"])

    if par["model"]:
        logger.info("Predicting cell types using pre-trained model")
        model.BuildModel(use_pretrain=par["model"], ngene=None)
    else:
        logger.info("Reading reference data")
        reference_mudata = mu.read_h5mu(par["reference"])
        reference_modality = reference_mudata.mod[par["modality"]].copy()

        # Keep the original index as a column, then index by the "ensemblid" column.
        reference_modality.var["gene_symbol"] = list(reference_modality.var.index)
        reference_modality = set_var_index(reference_modality, "ensemblid")

        logger.info("Detecting common vars based on ensembl ids")
        common_ens_ids = set(reference_modality.var.index) & set(input_modality.var.index)
        logger.info(" reference n_vars: %i", reference_modality.n_vars)
        logger.info(" input n_vars: %i", input_modality.n_vars)
        logger.info(" intersect n_vars: %i", len(common_ens_ids))
        # Raised explicitly because ``assert`` is stripped under ``python -O``.
        if len(common_ens_ids) < 100:
            raise ValueError("The intersection of genes is too small.")

        reference_matrix = _dense_matrix(reference_modality, par["reference_layer"])

        logger.info("Training a model from reference...")
        labels = reference_modality.obs[par["reference_obs_target"]].tolist()
        # Labels are cell type names; the model is given ontology IDs.
        labels_cl = [name_to_id[label] for label in labels]

        _ = model.EmbedCellTypes(labels_cl)
        (
            corr_train_feature,
            _,
            corr_train_genes,
            _,
        ) = model.ProcessTrainFeature(
            train_feature=reference_matrix,
            train_label=labels_cl,
            train_genes=reference_modality.var_names,
            test_feature=input_matrix,
            test_genes=input_modality.var_names,
            log_transform=False,
        )
        model.BuildModel(ngene=len(corr_train_genes))
        model.Train(corr_train_feature,
                    labels_cl,
                    max_iter=par["max_iter"])

    logger.info("Predicting cell types")
    input_modality = predict_input_data(model,
                                        input_matrix,
                                        input_modality,
                                        id_to_name,
                                        par["output_obs_predictions"],
                                        par["output_obs_probability"])

    logger.info("Writing output data")
    input_mudata.mod[par["modality"]] = input_modality
    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,146 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
import anndata as ad
from openpipelinetestutils.asserters import assert_annotation_objects_equal
import os
## VIASH START
meta = {
"resources_dir": "resources_test"
}
## VIASH END
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
cl_nlp_emb_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology"
cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo"
model_file = f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
@pytest.fixture
def swap_gene_symbol(random_h5mu_path):
    """Fixture returning a helper that re-indexes a modality's ``.var`` by gene symbol.

    The helper reads an h5mu file, stores the current var index in an
    ``ensemblid`` column, indexes ``.var`` by the ``gene_symbol`` column (with
    any trailing ``.<digits>`` suffix removed), writes the result to a new
    h5mu file and returns that file's path.
    """
    def wrapper(input_mudata_file, modality):
        mdata = mu.read_h5mu(input_mudata_file)
        swapped = mdata.mod[modality].copy()
        # Preserve the current index in a column before replacing it.
        swapped.var["ensemblid"] = list(swapped.var.index)
        symbols = swapped.var["gene_symbol"]
        swapped.var.index = [re.sub("\\.[0-9]+$", "", symbol) for symbol in symbols]
        mdata.mod[modality] = swapped
        destination = random_h5mu_path()
        mdata.write_h5mu(destination)
        return destination

    return wrapper
def test_simple_execution(run_component, random_h5mu_path):
    """Run the component with a reference (no pre-trained model) and check the default output columns."""
    output_file = random_h5mu_path()

    run_component([
        "--input", input_file,
        "--reference", reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--cl_nlp_emb_file", cl_nlp_emb_file,
        "--cl_ontology_file", cl_ontology_file,
        "--cl_obo_file", cl_obo_file,
        "--max_iter", "10",
        "--output", output_file
    ])

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)

    # The modality that was not processed must be unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred',
                                                         'onclass_prob']

    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
    # The message names the column that is actually checked.
    assert all(0 <= value <= 1 for value in obs_values), ".obs at onclass_prob has values outside the range [0, 1]"
def test_custom_obs(run_component, random_h5mu_path):
    """Run the component with custom output column names and check they are the ones written."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--reference", reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output_obs_predictions", "dummy_pred_1",
        "--output_obs_probability", "dummy_prob_1",
        "--cl_nlp_emb_file", cl_nlp_emb_file,
        "--cl_ontology_file", cl_ontology_file,
        "--cl_obo_file", cl_obo_file,
        "--max_iter", "10",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)

    # The modality that was not processed must be unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    assert set(rna_obs.keys()) == {'dummy_pred_1', 'dummy_prob_1'}

    # Every probability column must hold values within [0, 1].
    for key in ('dummy_prob_1',):
        obs_values = output_mudata.mod["rna"].obs[key]
        assert all(0 <= value <= 1 for value in obs_values), f".obs at {key} has values outside the range [0, 1]"
def test_no_model_no_reference_error(run_component, random_h5mu_path):
    """The component must fail with a ValueError message when neither --model nor --reference is given."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--output", output_file,
        "--cl_nlp_emb_file", cl_nlp_emb_file,
        "--cl_ontology_file", cl_ontology_file,
        "--cl_obo_file", cl_obo_file,
        "--reference_obs_target", "cell_ontology_class",
    ]

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    # The error text is looked up in the captured stdout of the failed process.
    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
                     captured_stdout)
def test_pretrained_model(run_component, random_h5mu_path, swap_gene_symbol):
    """Run the component with a pre-trained model on an input re-indexed by gene symbol."""
    output_file = random_h5mu_path()
    # The input passed to the component is first re-indexed by gene symbol.
    swapped_input_file = swap_gene_symbol(input_file, "rna")

    run_component([
        "--input", swapped_input_file,
        "--cl_nlp_emb_file", cl_nlp_emb_file,
        "--cl_ontology_file", cl_ontology_file,
        "--cl_obo_file", cl_obo_file,
        "--reference_obs_target", "cell_ontology_class",
        "--model", model_file,
        "--output", output_file
    ])

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)

    # The modality that was not processed must be unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred',
                                                         'onclass_prob']

    obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
    # The message names the column that is actually checked.
    assert all(0 <= value <= 1 for value in obs_values), ".obs at onclass_prob has values outside the range [0, 1]"
if __name__ == '__main__':
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,165 @@
name: popv
namespace: "annotate"
description: "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV."
authors:
- __merge__: /src/authors/matthias_beyens.yaml
roles: [ author ]
- __merge__: /src/authors/robrecht_cannoodt.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Arguments related to the input (aka query) dataset.
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: Input h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.
required: false
- name: "--input_obs_batch"
type: string
description: Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.
required: false
- name: "--input_var_subset"
type: string
description: Subset the input object with this column.
required: false
- name: "--input_obs_label"
type: string
description: Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `"unknown_celltype_label"`.
required: false
- name: "--unknown_celltype_label"
type: string
description: If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.
default: "unknown"
required: false
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "User-provided reference tissue. The data that will be used as reference to call cell types."
example: TS_Bladder_filtered.h5ad
direction: input
required: true
- name: "--reference_layer"
type: string
description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.
required: false
- name: "--reference_obs_label"
type: string
description: Key in obs field of reference AnnData with cell-type information.
default: "cell_ontology_class"
required: false
- name: "--reference_obs_batch"
type: string
description: Key in obs field of reference adata for batch information.
default: "donor_assay"
required: false
# - name: "--reference_models"
# type: file
# description: Pretrained models. Can be a directory or a tar gz.
# required: false
# example: pretrained_models_Bladder_ts.tar.gz
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
required: true
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
# - name: "--output_models"
# type: file
# direction: output
# description: If `prediction_mode == "retrain"`, saves models to a directory and compresses the results into a tar gz.
# example: "output.tar.gz"
# required: false
- name: Arguments
description: Other arguments.
arguments:
- name: "--methods"
type: string
description: "Methods to call cell types, for example knn_on_scvi and scanvi."
example: ["knn_on_scvi", "scanvi"]
choices: [celltypist, knn_on_bbknn, knn_on_scanorama, knn_on_scvi, onclass, rf, scanvi, svm]
required: true
multiple: true
# - name: "--prediction_mode"
# type: string
# description: |
# Execution mode of cell-type annotation.
# "retrain": Train all prediction models and saves them to disk. Argument `output_models` must be defined.
# "inference": Classify all cells based on pretrained models. Argument `reference_models` must be defined.
# "fast": Fast inference using only query cells and single epoch in scArches.
# - name: "--plots"
# type: boolean
# description: "Creation of agreement and frequency plots between selected cell type algorithm(s) and final PopV ensemble called cell type."
# default: false
# required: false
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
#image: nvcr.io/nvidia/pytorch:22.12-py3
image: python:3.9-slim
setup:
- type: apt
packages:
- procps
- git
- build-essential
- wget
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
packages:
- scvi-tools~=1.0.3
- popv~=0.3.2
- jax==0.4.10
- jaxlib==0.4.10
- ml-dtypes<0.3.0
- scipy==1.12.0
# These need to be updated AFTER popv is installed.
# See https://github.com/YosefLab/PopV/issues/30
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
# download ontology required by popv
- type: docker
run: |
cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \
cd PopV && git fetch --depth 1 origin tag v0.2 && git checkout v0.2
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
# TODO: should add new label highmem-single-gpu and lowmem-single-gpu
label: [highmem, highcpu]

223
src/annotate/popv/script.py Normal file
View File

@@ -0,0 +1,223 @@
import sys
import re
import tempfile
import typing
import numpy as np
import mudata as mu
import anndata as ad
import popv
# todo: is this still needed?
from torch.cuda import is_available as cuda_is_available
try:
from torch.backends.mps import is_available as mps_is_available
except ModuleNotFoundError:
# Older pytorch versions
# MacOS GPUs
def mps_is_available():
return False
# where to find the obo files
cl_obo_folder = "/opt/PopV/ontology/"
## VIASH START
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    # "input": "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu",
    "modality": "rna",
    "reference": "resources_test/annotation_test_data/tmp_TS_Blood_filtered.h5ad",
    "input_obs_batch": None,
    "input_layer": None,
    "input_obs_label": None,
    "input_var_subset": None,
    "unknown_celltype_label": "unknown",
    "reference_layer": None,
    "reference_obs_label": "cell_ontology_class",
    "reference_obs_batch": "donor_assay",
    "output": "output.h5mu",
    "output_compression": "gzip",
    "methods": [
        # "celltypist",
        # "knn_on_bbknn",
        # "knn_on_scanorama",
        # "knn_on_scvi",
        "rf",
        # "scanvi",
        "svm",
    ]
}
# The script reads meta["resources_dir"] right after this section and
# meta["temp_dir"] inside main(); an empty dict makes both lookups fail.
meta = {
    "resources_dir": "src/annotate/popv",
    "temp_dir": "/tmp",
}
# for debugging the obo folder can be somewhere local
cl_obo_folder = "popv_cl_ontology/"
## VIASH END
sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The needed modules are imported locally, so the function does not rely on
    any module-level import. Each call attaches one more stream handler.

    Returns
    -------
    logging.Logger
        The root logger.
    """
    import logging
    import sys

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)-8s %(message)s"))

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)
    return root
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
use_gpu = cuda_is_available() or mps_is_available()
logger.info("GPU enabled? %s", use_gpu)
# Helper functions
def get_X(adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[list[str]]):
    """Fetch the counts data from X or a layer, optionally restricted to some vars.

    Parameters
    ----------
    adata
        Object to read the matrix from.
    layer
        Key in ``adata.layers``; when falsy, ``adata.X`` is returned instead.
    var_index
        Var names used to subset the columns; when falsy, no subsetting is done.
        The caller in this file passes a list of var names.
    """
    if var_index:
        adata = adata[:, var_index]
    return adata.layers[layer] if layer else adata.X
def get_obs(adata: ad.AnnData, obs_par_names):
    """Return ``adata.obs`` restricted to the columns named by the given ``par`` entries.

    ``obs_par_names`` lists keys of the module-level ``par`` dict; entries whose
    value is falsy are skipped.
    """
    obs_columns = []
    for par_name in obs_par_names:
        column = par[par_name]
        if column:
            obs_columns.append(column)
    return adata.obs[obs_columns]
def get_var(adata: ad.AnnData, var_index: list[str]):
    """Return the rows of ``adata.var`` selected by label through ``var_index``."""
    var_frame = adata.var
    return var_frame.loc[var_index]
def main(par, meta):
    """Annotate the query modality with PopV and write the result to ``par["output"]``.

    Parameters
    ----------
    par
        Component arguments (input/reference paths, obs keys, layers, methods, output).
    meta
        Runtime metadata; only ``meta["temp_dir"]`` is read here.

    Raises
    ------
    ValueError
        If no method is given, if ``par["input_var_subset"]`` is not a column
        of the input ``.var``, or if fewer than 100 var names are shared between
        the reference and the input.
    """
    # Raised explicitly because ``assert`` is stripped under ``python -O``.
    if len(par["methods"]) < 1:
        raise ValueError("Please, specify at least one method for cell typing.")
    logger.info("Cell typing methods: %s", par["methods"])

    ### PREPROCESSING REFERENCE ###
    logger.info("### PREPROCESSING REFERENCE ###")

    # take a look at reference data
    logger.info("Reading reference data '%s'", par["reference"])
    reference = ad.read_h5ad(par["reference"])

    logger.info("Setting reference var index to Ensembl IDs")
    reference.var["gene_symbol"] = list(reference.var.index)
    reference.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]]

    logger.info("Detect number of samples per label")
    # NOTE(review): this groups by the *batch* column although the variable and
    # the log message talk about labels/cell types -- confirm whether
    # par["reference_obs_label"] was intended.
    min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
    n_samples_per_label = np.max((min_celltype_size, 100))

    ### PREPROCESSING INPUT ###
    logger.info("### PREPROCESSING INPUT ###")
    logger.info("Reading '%s'", par["input"])
    # Named ``input_mudata`` rather than ``input`` to avoid shadowing the builtin.
    input_mudata = mu.read_h5mu(par["input"])
    input_modality = input_mudata.mod[par["modality"]]

    # subset with var column
    if par["input_var_subset"]:
        logger.info("Subset input with .var['%s']", par["input_var_subset"])
        if par["input_var_subset"] not in input_modality.var:
            raise ValueError(f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var")
        input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]]

    ### ALIGN REFERENCE AND INPUT ###
    logger.info("### ALIGN REFERENCE AND INPUT ###")

    logger.info("Detecting common vars based on ensembl ids")
    common_ens_ids = list(set(reference.var.index).intersection(set(input_modality.var.index)))

    logger.info(" reference n_vars: %i", reference.n_vars)
    logger.info(" input n_vars: %i", input_modality.n_vars)
    logger.info(" intersect n_vars: %i", len(common_ens_ids))
    if len(common_ens_ids) < 100:
        raise ValueError("The intersection of genes is too small.")

    # subset input objects to make sure popv is using the data we expect:
    # the rebuilt objects carry only X, the selected obs columns and the shared vars
    input_modality = ad.AnnData(
        X=get_X(input_modality, par["input_layer"], common_ens_ids),
        obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
        var=get_var(input_modality, common_ens_ids)
    )
    reference = ad.AnnData(
        X=get_X(reference, par["reference_layer"], common_ens_ids),
        obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
        var=get_var(reference, common_ens_ids)
    )

    ### ALIGN REFERENCE AND INPUT ###
    logger.info("### ALIGN REFERENCE AND INPUT ###")

    # Trained models are written to a temporary directory that is removed afterwards.
    with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir:
        logger.info("Run PopV processing")
        pq = popv.preprocessing.Process_Query(
            # input
            query_adata=input_modality,
            query_labels_key=par["input_obs_label"],
            query_batch_key=par["input_obs_batch"],
            query_layers_key=None,  # this is taken care of by subset
            # reference
            ref_adata=reference,
            ref_labels_key=par["reference_obs_label"],
            ref_batch_key=par["reference_obs_batch"],
            # options
            unknown_celltype_label=par["unknown_celltype_label"],
            n_samples_per_label=n_samples_per_label,
            # pretrained model
            # Might need to be parameterized at some point
            prediction_mode="retrain",
            pretrained_scvi_path=None,
            # outputs
            # Might need to be parameterized at some point
            save_path_trained_models=temp_dir,
            # hardcoded values
            cl_obo_folder=cl_obo_folder,
            use_gpu=use_gpu
        )
        method_kwargs = {}
        # NOTE(review): par["methods"] is a list of method names such as
        # "knn_on_scanorama", so this membership test looks like it can never be
        # true -- confirm the intended key before changing it.
        if 'scanorama' in par['methods']:
            method_kwargs['scanorama'] = {'approx': False}

        logger.info("Annotate data")
        popv.annotation.annotate_data(
            adata=pq.adata,
            methods=par["methods"],
            methods_kwargs=method_kwargs
        )

    # keep only the rows that belong to the query cells
    popv_input = pq.adata[input_modality.obs_names]

    # select columns starting with "popv_"
    popv_obs_cols = popv_input.obs.columns[popv_input.obs.columns.str.startswith("popv_")]

    # create new data frame with selected columns
    df_popv = popv_input.obs[popv_obs_cols]

    # remove prefix from column names
    df_popv.columns = df_popv.columns.str.replace("popv_", "")

    # store output in mudata .obsm
    input_mudata.mod[par["modality"]].obsm["popv_output"] = df_popv

    # copy important output in mudata .obs
    for col in ["popv_prediction"]:
        if col in popv_input.obs.columns:
            input_mudata.mod[par["modality"]].obs[col] = popv_input.obs[col]

    # code to explore how the output differs from the original
    # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]:
    #     old_keys = set(getattr(pq_adata_orig, attr).keys())
    #     new_keys = set(getattr(pq.adata, attr).keys())
    #     diff_keys = list(new_keys.difference(old_keys))
    #     diff_keys.sort()
    #     print(f"{attr}:", flush=True)
    #     for key in diff_keys:
    #         print(f"  {key}", flush=True)

    # write output
    logger.info("Writing %s", par["output"])
    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main(par, meta)

73
src/annotate/popv/test.py Normal file
View File

@@ -0,0 +1,73 @@
import sys
import os
import pytest
import mudata as mu
## VIASH START
meta = {
"resources_dir": "resources_test"
}
## VIASH END
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
def test_simple_execution(run_component):
    """Run the component with the rf and svm methods and check ``popv_prediction`` is in the rna ``.obs``."""
    output_file = "output.h5mu"
    cli_args = [
        "--input", input_file,
        "--reference", reference_file,
        "--output", "output.h5mu",
        "--methods", "rf;svm",
    ]
    run_component(cli_args)

    # check whether file exists
    assert os.path.exists(output_file), "Output file does not exist"

    # read output mudata
    output = mu.read_h5mu(output_file)

    # check output
    for col in ("popv_prediction",):
        assert col in output.mod["rna"].obs.columns, f"could not find columns .mod['rna'].obs['{col}']"

    print(f"output: {output}", flush=True)
def test_popv_with_other_layer(run_component, tmp_path):
    """Run the component on an input whose rna modality carries an additional ``test`` layer."""
    mdata = mu.read(input_file)
    # Add a layer that is a copy of .X.
    mdata.mod['rna'].layers['test'] = mdata.mod['rna'].X.copy()
    mdata.write_h5mu(tmp_path / "input.h5mu")

    # NOTE(review): the new layer is not selected through --input_layer;
    # presumably the test only checks that its presence does not break the run.
    cli_args = [
        "--input", tmp_path / "input.h5mu",
        "--reference", reference_file,
        "--output", "output.h5mu",
        "--methods", "rf;svm;knn_on_scanorama",
    ]
    run_component(cli_args)
def test_popv_with_non_overlapping_cells(run_component, tmp_path):
    """Run the component on an input whose modalities share no observation names."""
    source = mu.read(input_file)

    # copy previous modalities
    rna_ad = source.mod["rna"].copy()
    prot_ad = source.mod["prot"].copy()

    # prefix obs_names per modality so that the cells do not overlap
    rna_ad.obs_names = ["rna_" + f"{name}" for name in rna_ad.obs_names]
    prot_ad.obs_names = ["prot_" + f"{name}" for name in prot_ad.obs_names]

    # write new h5mu to file
    disjoint = mu.MuData({"rna": rna_ad, "prot": prot_ad})
    disjoint.write_h5mu(tmp_path / "input.h5mu")

    # run component
    cli_args = [
        "--input", tmp_path / "input.h5mu",
        "--reference", reference_file,
        "--output", "output.h5mu",
        "--methods", "rf;svm;knn_on_scanorama",
    ]
    run_component(cli_args)
if __name__ == '__main__':
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,140 @@
name: random_forest_annotation
namespace: annotate
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the random forest classifier on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
description: Key in obs field of reference modality with cell-type information.
required: true
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_obs_predictions"
type: string
default: random_forest_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: random_forest_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--n_estimators"
type: integer
required: false
default: 100
description: Number of trees in the random forest.
- name: "--max_depth"
type: integer
required: false
description: |
Maximum depth of the trees in the random forest.
If not provided, the nodes are expanded until all leaves only contain a single sample.
- name: "--criterion"
type: string
required: false
choices: ["gini", "entropy", "log_loss"]
default: "gini"
description: The function to measure the quality of a split.
- name: "--class_weight"
type: string
required: false
default: "balanced_subsample"
choices: ["balanced", "balanced_subsample", "uniform"]
description: |
Weights associated with classes.
The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.
The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.
The `uniform` mode gives all classes a weight of one.
- name: "--max_features"
type: string
default: "200"
description: |
The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.
If integer: consider max_features features at each split.
If `sqrt`: max_features is the square root of all input features.
If `log2`: max_features is the log2 of all input features.
If `all`: max features equals all input features.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- type: python
packages:
- scikit-learn==1.4.2
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,103 @@
import sys
import logging
import mudata as mu
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle
## VIASH START
# Debug-time defaults used when the script is run directly.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    "output": "output.h5mu",
    "modality": "rna",
    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
    "model": None,
    "reference_obs_target": "cell_ontology_class",
    "input_layer": None,
    # Listed once; the second "reference_layer" entry was a duplicate key.
    "reference_layer": None,
    "n_estimators": 100,
    "criterion": "gini",
    "max_depth": None,
    "class_weight": None,
    "max_features": 200,
    "output_compression": "gzip",
    "output_obs_predictions": "random_forest_pred",
    "output_obs_probability": "random_forest_probability",
}
# NOTE(review): this path names the svm component; presumably it should point at
# this component's own directory -- confirm before changing.
meta = {"resources_dir": "src/annotate/svm"}
## VIASH END
sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
def setup_logger():
    """Attach an stdout handler to the root logger and set its level to INFO.

    Records are formatted as ``<time> <level, padded to 8> <message>``.
    Each call adds one more handler.

    Returns
    -------
    logging.Logger
        The root logger.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    to_stdout = logging.StreamHandler(sys.stdout)
    record_format = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    to_stdout.setFormatter(record_format)

    root.addHandler(to_stdout)
    return root
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
def _parse_max_features(value):
    """Translate the ``--max_features`` argument into the value handed to the classifier.

    ``"all"`` maps to ``None``, ``"sqrt"`` and ``"log2"`` are passed through, and
    anything else must be convertible to ``int``.

    Raises
    ------
    ValueError
        If ``value`` is neither one of the named choices nor an integer.
    """
    named_choices = {"all": None, "sqrt": "sqrt", "log2": "log2"}
    # Named choices are checked first: calling ``int()`` on them would raise.
    if value in named_choices:
        return named_choices[value]
    try:
        return int(value)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Invalid value {value} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'"
        ) from err


def main():
    """Annotate the input modality with a random forest and write the result to ``par["output"]``.

    Exactly one of ``par["model"]`` (pickled pre-trained model) or
    ``par["reference"]`` (reference h5mu to train on) must be set.

    Raises
    ------
    ValueError
        If ``par["max_features"]`` is invalid, or if neither or both of 'model'
        and 'reference' are provided.
    """
    logger.info("Reading input data")
    input_mudata = mu.read_h5mu(par["input"])
    input_modality = input_mudata.mod[par["modality"]].copy()
    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X

    # Handle max_features parameter
    max_features = _parse_max_features(par["max_features"])

    # Exactly one of the two must be truthy.
    if bool(par["model"]) == bool(par["reference"]):
        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")

    if par["model"]:
        logger.info("Loading a pre-trained model")
        # NOTE: unpickling can execute arbitrary code; only load model files from trusted sources.
        with open(par["model"], "rb") as model_file:
            model = pickle.load(model_file)
    else:
        logger.info("Reading reference data")
        reference_mudata = mu.read_h5mu(par["reference"])
        reference_modality = reference_mudata.mod[par["modality"]].copy()
        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X

        logger.info("Training a model...")
        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
        model = RandomForestClassifier(
            n_estimators=par["n_estimators"],
            criterion=par["criterion"],
            max_depth=par["max_depth"],
            # "uniform" is this component's spelling for "no class weighting".
            class_weight=par["class_weight"] if par["class_weight"] != "uniform" else None,
            max_features=max_features
        )
        model.fit(reference_matrix, labels)

    logger.info("Running predictions...")
    predictions = model.predict(input_matrix)
    # The reported probability is the highest class probability per cell.
    probabilities = np.max(model.predict_proba(input_matrix), axis=1)

    input_modality.obs[par["output_obs_predictions"]] = predictions
    input_modality.obs[par["output_obs_probability"]] = probabilities

    logger.info("Writing output data")
    input_mudata.mod[par["modality"]] = input_modality
    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,185 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
from openpipelinetestutils.asserters import assert_annotation_objects_equal
import os
from sklearn.ensemble import RandomForestClassifier
import pickle
## VIASH START
# Placeholder used when this test file is run directly
# (presumably the VIASH START/END section is substituted by viash at build time).
meta = {
    "resources_dir": "resources_test"
}
## VIASH END
# Query (input) and reference datasets, resolved relative to the resources directory.
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
@pytest.fixture
def subset_genes(random_h5mu_path):
    """Provide a helper that restricts a query and a reference h5mu to their shared features.

    The returned callable takes the query file, the reference file and a
    modality name. It re-indexes the reference ``.var`` by the ``ensemblid``
    column (with a trailing ``.<digits>`` suffix removed), keeps only the
    features present in both datasets for that modality, writes both subsets
    to new h5mu files and returns ``(query_path, reference_path)``.
    """
    version_suffix = "\\.[0-9]+$"

    def wrapper(input_mudata_file, reference_mudata_file, modality):
        query_mdata = mu.read_h5mu(input_mudata_file)
        query_mod = query_mdata.mod[modality]
        ref_mdata = mu.read_h5mu(reference_mudata_file)
        ref_mod = ref_mdata.mod[modality]

        # Preserve the current index in a "gene_symbol" column, then switch the
        # index to the version-stripped "ensemblid" values. Done for the MuData
        # object first and the modality second.
        for annotated in (ref_mdata, ref_mod):
            annotated.var["gene_symbol"] = list(annotated.var.index)
            annotated.var.index = [
                re.sub(version_suffix, "", ens_id)
                for ens_id in annotated.var["ensemblid"]
            ]

        shared_ids = list(
            set(ref_mod.var.index).intersection(set(query_mod.var.index))
        )
        ref_subset = ref_mod[:, shared_ids].copy()
        query_subset = query_mod[:, shared_ids].copy()
        query_mdata.mod[modality] = query_subset
        ref_mdata.mod[modality] = ref_subset

        query_path = random_h5mu_path()
        ref_path = random_h5mu_path()
        query_mdata.write_h5mu(query_path)
        ref_mdata.write_h5mu(ref_path)
        return query_path, ref_path

    return wrapper
@pytest.fixture
def dummy_model(tmp_path, subset_genes):
    """Train a default random forest on the subsetted reference and pickle it.

    Returns the path of the pickled model inside ``tmp_path``.
    """
    subset_reference_file = subset_genes(input_file, reference_file, "rna")[1]
    rna = mu.read_h5mu(subset_reference_file).mod["rna"]
    targets = rna.obs["cell_ontology_class"].to_numpy()

    classifier = RandomForestClassifier()
    classifier.fit(rna.X, targets)

    model_path = tmp_path / "model.pkl"
    with model_path.open("wb") as handle:
        pickle.dump(classifier, handle)
    return model_path
def test_simple_execution(run_component, random_h5mu_path, subset_genes):
    """Train on the reference with default settings and check the written annotations."""
    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--reference", subset_reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['random_forest_pred', 'random_forest_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["random_forest_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_custom_out_obs_model_params(run_component, random_h5mu_path, subset_genes):
    """Run with custom output column names and non-default model hyperparameters."""
    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--reference", subset_reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output_obs_predictions", "dummy_pred",
        "--output_obs_probability", "dummy_probability",
        "--n_estimators", "10",
        "--criterion", "entropy",
        "--max_depth", "5",
        "--class_weight", "balanced",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['dummy_pred', 'dummy_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["dummy_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_with_model(run_component, random_h5mu_path, dummy_model, subset_genes):
    """Annotate the query with a pre-trained pickled model instead of a reference."""
    subset_input_file = subset_genes(input_file, reference_file, "rna")[0]
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--model", dummy_model,
        "--output", output_file,
        "--reference_obs_target", "cell_ontology_class",
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['random_forest_pred', 'random_forest_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["random_forest_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_no_model_no_reference_error(run_component, random_h5mu_path):
    """The component must fail when neither --model nor --reference is given."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--output", output_file,
        "--reference_obs_target", "cell_ontology_class",
    ]
    expected_message = r"ValueError: Make sure to provide either 'model' or 'reference', but not both."

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(expected_message, captured_stdout)
def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model, subset_genes):
    """The component must fail when both --model and --reference are given."""
    output_file = random_h5mu_path()
    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
    cli_args = [
        "--input", subset_input_file,
        "--output", output_file,
        "--reference", subset_reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--model", dummy_model,
    ]
    expected_message = r"ValueError: Make sure to provide either 'model' or 'reference', but not both."

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(expected_message, captured_stdout)
def test_invalid_max_features(run_component, random_h5mu_path):
    """The component must reject a --max_features value that is neither an integer nor a known keyword."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--output", output_file,
        "--reference_obs_target", "cell_ontology_class",
        "--max_features", "invalid_value",
    ]
    # The spelling below mirrors the message emitted by the component verbatim.
    expected_message = r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'"

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(expected_message, captured_stdout)
# When executed directly, run this file through pytest and exit with its status code.
if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,218 @@
name: scanvi
namespace: annotate
description: Semi-supervised model for single-cell transcriptomics data. A scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
description: Arguments related to the input (aka query) dataset.
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: Input h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: Reference h5mu file.
direction: input
required: true
example: reference.h5mu
- name: "--scvi_reference_model"
type: file
description: "Pretrained scvi reference model"
example: scvi_model.pt
direction: input
required: true
- name: "--reference_obs_label"
type: string
description: Key in obs field of reference AnnData with cell-type information.
example: "cell_ontology_class"
required: true
- name: SCANVI reference model training arguments
description: Arguments related to the reference SCANVI model.
arguments:
- name: "--reference_train_size"
type: double
description: Size of training set.
required: false
default: 0.9
min: 0.0
max: 1.0
- name: "--reference_max_epochs"
type: integer
description: Maximum number of epochs.
required: false
default: 400
- name: "--reference_learning_rate"
type: double
description: Learning rate.
required: false
default: 1e-3
- name: "--reference_reduce_lr_on_plateau"
type: boolean
description: Reduce learning rate on plateau.
required: false
default: true
- name: "--reference_lr_patience"
type: integer
description: Patience for learning rate reduction.
required: false
default: 25
- name: "--reference_lr_factor"
type: double
description: Factor by which to reduce learning rate.
required: false
default: 0.5
min: 0.0
max: 1.0
- name: "--reference_early_stopping"
type: boolean
description: Early stopping.
required: false
default: true
- name: "--reference_early_stopping_patience"
type: integer
description: Patience for early stopping.
required: false
default: 50
- name: SCANVI query model training arguments
description: Arguments related to the query SCANVI model.
arguments:
- name: "--query_train_size"
type: double
description: Size of training set.
required: false
default: 0.9
min: 0.0
max: 1.0
- name: "--query_max_epochs"
type: integer
description: Maximum number of epochs.
required: false
default: 400
- name: "--query_learning_rate"
type: double
description: Learning rate.
required: false
default: 1e-3
- name: "--query_reduce_lr_on_plateau"
type: boolean
description: Reduce learning rate on plateau.
required: false
default: true
- name: "--query_lr_patience"
type: integer
description: Patience for learning rate reduction.
required: false
default: 25
- name: "--query_lr_factor"
type: double
description: Factor by which to reduce learning rate.
required: false
default: 0.5
min: 0.0
max: 1.0
- name: "--query_early_stopping"
type: boolean
description: Early stopping.
required: false
default: true
- name: "--query_early_stopping_patience"
type: integer
description: Patience for early stopping.
required: false
default: 50
- name: Outputs
description: Arguments related to the output.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
required: true
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_model"
type: file
description: Folder where the state of the trained model will be saved to.
direction: output
required: false
example: model_dir
- name: "--output_obs_predictions"
type: string
description: |
In which `.obs` slots to store the predicted information.
default: "scanvi_pred"
required: false
- name: "--output_obs_probability"
type: string
default: "scanvi_probability"
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
- name: "--output_obsm_scanvi_embedding"
type: string
default: "scanvi_embedding"
required: false
description: |
In which `.obsm` slot to store the scANVI latent embedding.
- name: "--unknown_celltype"
type: string
default: "Unknown"
required: false
description: |
Label for unknown cell types.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
- type: python
packages:
- scvi-tools==1.1.5
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,104 @@
import sys
import mudata as mu
import scvi
import numpy as np
## VIASH START
# Development defaults used when the script is run directly. The values mirror
# the defaults declared in the component config so that every `par[...]` key
# read further down is defined.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
    "modality": "rna",
    # Read with mu.read_h5mu below, so this has to be an h5mu file.
    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
    "scvi_reference_model": "resources_test/annotation_test_data/scvi_model.pt",
    "reference_obs_label": "cell_ontology_class",
    "unknown_celltype": "Unknown",
    "reference_train_size": 0.9,
    "reference_max_epochs": 400,
    "reference_learning_rate": 1e-3,
    "reference_reduce_lr_on_plateau": True,
    "reference_lr_patience": 25,
    "reference_lr_factor": 0.5,
    "reference_early_stopping": True,
    "reference_early_stopping_patience": 50,
    "query_train_size": 0.9,
    "query_max_epochs": 400,
    "query_learning_rate": 1e-3,
    "query_reduce_lr_on_plateau": True,
    "query_lr_patience": 25,
    "query_lr_factor": 0.5,
    "query_early_stopping": True,
    "query_early_stopping_patience": 50,
    "output": "output.h5mu",
    "output_compression": "gzip",
    "output_model": None,
    "output_obs_predictions": "scanvi_pred",
    "output_obs_probability": "scanvi_probability",
    "output_obsm_scanvi_embedding": "scanvi_embedding",
}
# "resources_dir" is read right after this block; an empty dict raised KeyError.
meta = {"resources_dir": "src/annotate/scanvi"}
## VIASH END
sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    """Set the root logger to INFO, attach a stdout handler to it and return it.

    Every call adds one more stream handler to the root logger.
    """
    import logging
    from sys import stdout

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    handler = logging.StreamHandler(stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root.addHandler(handler)
    return root
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

# Load the query and reference datasets and select the requested modality from each.
logger.info("Reading the input and reference data")
input_data = mu.read_h5mu(par["input"])
query = input_data.mod[par["modality"]]
reference_data = mu.read_h5mu(par["reference"])
reference = reference_data.mod[par["modality"]]

# Restore the pretrained scVI model, attaching the reference modality to it.
logger.info(f"Loading the pretrained scVI model from {par['scvi_reference_model']}")
scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"], reference)

# Derive a scANVI model from the scVI model, using the reference labels.
logger.info("Setting up scANVI model")
scanvi_ref = scvi.model.SCANVI.from_scvi_model(
    scvi_reference_model,
    unlabeled_category=par["unknown_celltype"],
    labels_key=par["reference_obs_label"],
)
# Learning-rate settings forwarded to train() through `plan_kwargs`.
reference_plan_kwargs = {"lr": par["reference_learning_rate"],
    "reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
    "lr_patience": par['reference_lr_patience'],
    "lr_factor": par['reference_lr_factor']
}
logger.info("Training scANVI model on reference data with celltype labels")
scanvi_ref.train(
    train_size=par["reference_train_size"],
    max_epochs=par['reference_max_epochs'],
    early_stopping=par['reference_early_stopping'],
    early_stopping_patience=par['reference_early_stopping_patience'],
    plan_kwargs=reference_plan_kwargs,
    check_val_every_n_epoch=1,
    accelerator="auto",
)

# Prepare the query data against the trained reference model (modifies `query`
# in place), then build and train a query model from it.
logger.info("Updating and training scANVI model with query data")
scvi.model.SCANVI.prepare_query_anndata(query, scanvi_ref, inplace=True)
scanvi_query = scvi.model.SCANVI.load_query_data(query, scanvi_ref)
# Learning-rate settings for the query training run, mirroring the reference ones.
query_plan_kwargs = {"lr": par["query_learning_rate"],
    "reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
    "lr_patience": par['query_lr_patience'],
    "lr_factor": par['query_lr_factor']
}
scanvi_query.train(
    train_size=par["query_train_size"],
    max_epochs=par['query_max_epochs'],
    early_stopping=par['query_early_stopping'],
    early_stopping_patience=par['query_early_stopping_patience'],
    plan_kwargs=query_plan_kwargs,
    check_val_every_n_epoch=1,
    accelerator="auto",
)

# Store the latent representation of the query model in `.obsm`.
logger.info("Adding latent representation to query data")
query.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()

# Store the predictions and, per observation, the row-wise maximum of the
# soft predictions in `.obs`.
logger.info("Running predictions on query data")
query.obs[par["output_obs_predictions"]] = scanvi_query.predict(query)
query.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(query, soft=True), axis=1)

# Write the annotated MuData and, if requested, the trained query model.
logger.info("Saving output and model")
input_data.mod[par["modality"]] = query
input_data.write_h5mu(par["output"], compression=par["output_compression"])
if par["output_model"]:
    scanvi_query.save(par["output_model"], overwrite=True)

142
src/annotate/scanvi/test.py Normal file
View File

@@ -0,0 +1,142 @@
import sys
import os
import pytest
import re
import mudata as mu
import anndata as ad
from openpipelinetestutils.asserters import assert_annotation_objects_equal
import scvi
import os
## VIASH START
# Placeholder used when this test file is run directly
# (presumably the VIASH START/END section is substituted by viash at build time).
meta = {
    "resources_dir": "resources_test"
}
## VIASH END
# Query (input) and reference datasets, resolved relative to the resources directory.
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
@pytest.fixture
def create_scvi_model(random_path, tmp_path):
    """Provide a helper that prepares matching query/reference data and a trained scVI model.

    The returned callable restricts the ``rna`` modality of both datasets to
    their shared features, trains a small scVI model on the reference for 10
    epochs and returns ``(model_dir, query_file, reference_file)``. The model
    is saved into ``tmp_path``.
    """
    version_suffix = "\\.[0-9]+$"

    def wrapper(input_file, reference_file):
        query_mdata = mu.read_h5mu(input_file)
        query_rna = query_mdata.mod["rna"]
        ref_mdata = mu.read_h5mu(reference_file)
        ref_rna = ref_mdata.mod["rna"]

        # Preserve the current index in a "gene_symbol" column, then switch the
        # index to the version-stripped "ensemblid" values. Done for the MuData
        # object first and the modality second.
        for annotated in (ref_mdata, ref_rna):
            annotated.var["gene_symbol"] = list(annotated.var.index)
            annotated.var.index = [
                re.sub(version_suffix, "", ens_id)
                for ens_id in annotated.var["ensemblid"]
            ]

        shared_ids = list(
            set(ref_rna.var.index).intersection(set(query_rna.var.index))
        )
        reference = ref_rna[:, shared_ids].copy()
        query = query_rna[:, shared_ids].copy()

        scvi.model.SCVI.setup_anndata(reference, labels_key="cell_ontology_class")
        model_kwargs = {
            "use_layer_norm": "both",
            "use_batch_norm": "none",
            "encode_covariates": True,
            "dropout_rate": 0.2,
            "n_layers": 1,
        }
        scvi_model = scvi.model.SCVI(reference, **model_kwargs)
        scvi_model.train(max_epochs=10)

        query_mdata.mod["rna"] = query
        ref_mdata.mod["rna"] = reference

        query_path = random_path(extension="h5mu")
        reference_path = random_path(extension="h5mu")
        model_dir = tmp_path
        query_mdata.write_h5mu(query_path)
        ref_mdata.write_h5mu(reference_path)
        scvi_model.save(model_dir, overwrite=True)
        return model_dir, query_path, reference_path

    return wrapper
def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
    """Run the component with short training runs and check the added annotations."""
    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file_scvi,
        "--reference", reference_file_scvi,
        "--scvi_reference_model", scvi_model_file,
        "--reference_obs_label", "cell_ontology_class",
        "--reference_max_epochs", "10",
        "--query_max_epochs", "10",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file_scvi)
    output_mudata = mu.read_h5mu(output_file)
    input_rna = input_mudata.mod["rna"]
    output_rna = output_mudata.mod["rna"]

    assert input_rna.n_obs == output_rna.n_obs, f"Number of observations changed"
    assert input_rna.n_vars == output_rna.n_vars, f"Number of variables changed"
    assert "scanvi_embedding" in output_rna.obsm.keys(), "Latent representation not added"
    assert "scanvi_pred" in output_rna.obs.keys(), "Predictions not added"
    assert "scanvi_probability" in output_rna.obs.keys(), "Probabilities not added"

    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])
def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, tmp_path):
    """Run the component with every training and output argument set explicitly.

    Only arguments declared in the component config are passed. The trained
    model is written to its own sub-directory of ``tmp_path``: the
    ``create_scvi_model`` fixture already saves the scVI reference model
    directly into ``tmp_path``, so checking for ``model.pt`` there would
    succeed even if the component never wrote a model.
    """
    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
    output_file = random_h5mu_path()
    output_model_dir = tmp_path / "scanvi_output_model"

    run_component([
        "--input", input_file_scvi,
        "--reference", reference_file_scvi,
        "--scvi_reference_model", scvi_model_file,
        "--reference_obs_label", "cell_ontology_class",
        "--output", output_file,
        "--reference_max_epochs", "10",
        "--reference_reduce_lr_on_plateau", "True",
        "--reference_lr_patience", "5",
        "--reference_lr_factor", "0.5",
        "--reference_train_size", "0.8",
        "--reference_early_stopping", "True",
        "--reference_early_stopping_patience", "5",
        "--query_max_epochs", "10",
        "--query_reduce_lr_on_plateau", "True",
        "--query_lr_patience", "5",
        "--query_lr_factor", "0.5",
        "--query_train_size", "0.8",
        "--query_early_stopping", "True",
        "--query_early_stopping_patience", "5",
        "--output_obs_predictions", "scanvi_pred",
        # Argument name and value must match the config and the assertion below.
        "--output_obs_probability", "scanvi_probability",
        "--output_compression", "gzip",
        "--output_model", output_model_dir,
    ])

    assert os.path.exists(output_file), "Output file does not exist"
    assert os.path.exists(output_model_dir / "model.pt"), "Model file does not exist"

    input_mudata = mu.read_h5mu(input_file_scvi)
    output_mudata = mu.read_h5mu(output_file)
    input_rna = input_mudata.mod["rna"]
    output_rna = output_mudata.mod["rna"]

    assert input_rna.n_obs == output_rna.n_obs, "Number of observations changed"
    assert input_rna.n_vars == output_rna.n_vars, "Number of variables changed"
    assert "scanvi_embedding" in output_rna.obsm.keys(), "Latent representation not added"
    assert "scanvi_pred" in output_rna.obs.keys(), "Predictions not added"
    assert "scanvi_probability" in output_rna.obs.keys(), "Probabilities not added"

    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])
# When executed directly, run this file through pytest and exit with its status code.
if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,126 @@
name: svm_annotation
namespace: annotate
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data to be used for cell type annotation if .X is not to be used.
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the SVM classifier on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
required: false
- name: "--reference_obs_target"
type: string
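description: Key in the `.obs` field of the reference modality that holds the cell type labels used to train the classifier.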
required: true
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_obs_prediction"
type: string
default: svm_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: svm_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: true
- name: "--max_iter"
type: integer
description: "Maximum number of iterations for the SVM."
min: 1
default: 5000
- name: "--c_reg"
type: double
description: "Regularization parameter for the SVM."
min: 0.0
default: 1.0
- name: "--class_weight"
type: string
description: |
Class weights for the SVM. The `uniform` mode gives all classes a weight of one.
The `balanced` mode (default) uses the values of y to automatically adjust weights inversely
proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
choices: ["balanced", "uniform"]
default: "balanced"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- type: python
packages:
- scikit-learn==1.5.2
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,94 @@
import sys
import logging
import mudata as mu
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn import svm
import pickle
import re
## VIASH START
# Development defaults used when the script is run directly. Each key
# corresponds to an argument declared in the component config.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    "output": "output.h5mu",
    "modality": "rna",
    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
    "model": None,
    "reference_obs_target": "cell_ontology_class",
    "input_layer": None,
    "reference_layer": None,
    "feature_selection": True,
    "max_iter": 5000,
    "c_reg": 1.0,
    "class_weight": "balanced",
    "output_compression": "gzip",
    "output_obs_prediction": "svm_pred",
    "output_obs_probability": "svm_probability",
}
meta = {"resources_dir": "src/annotate/svm"}
## VIASH END
sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
def setup_logger():
    """Set the root logger to INFO, attach a stdout handler to it and return it.

    Every call adds one more stream handler to the root logger.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root.addHandler(handler)
    return root
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
def main():
    """Annotate the query modality with an SVM classifier and write the result.

    Either loads a pickled pre-trained model (``par["model"]``) or trains a
    calibrated ``LinearSVC`` on the reference data (``par["reference"]``).
    The predicted label and the highest predicted class probability per
    observation are stored in ``.obs`` of the query modality, and the MuData
    object is written to ``par["output"]``.

    Raises:
        ValueError: If neither or both of ``model`` and ``reference`` are provided.
    """
    if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]):
        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")

    # NOTE(review): the config declares --feature_selection, but it is not used
    # anywhere in this function - confirm whether that is intended.

    logger.info("Reading input data")
    input_mudata = mu.read_h5mu(par["input"])
    input_modality = input_mudata.mod[par["modality"]].copy()
    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X

    if par["model"]:
        logger.info("Loading a pre-trained model")
        # NOTE(review): unpickling executes arbitrary code; only load model
        # files from trusted sources.
        # The context manager closes the file handle once the model is loaded.
        with open(par["model"], "rb") as model_file:
            model = pickle.load(model_file)

    elif par["reference"]:
        logger.info("Reading reference data")
        reference_mudata = mu.read_h5mu(par["reference"])
        reference_modality = reference_mudata.mod[par["modality"]].copy()
        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X

        logger.info("Training a model...")
        labels = reference_modality.obs[par["reference_obs_target"]].to_numpy()
        # LinearSVC is wrapped in CalibratedClassifierCV so that predict_proba
        # is available below.
        model = CalibratedClassifierCV(svm.LinearSVC(
            C=par["c_reg"],
            max_iter=par["max_iter"],
            # "uniform" is expressed to scikit-learn as class_weight=None.
            class_weight=par["class_weight"] if not par["class_weight"] == "uniform" else None,
            dual="auto",
        ))
        model.fit(reference_matrix, labels)

    logger.info("Running predictions...")
    predictions = model.predict(input_matrix)
    # Keep only the highest class probability for each observation.
    probabilities = np.max(model.predict_proba(input_matrix), axis=1)

    input_modality.obs[par["output_obs_prediction"]] = predictions
    input_modality.obs[par["output_obs_probability"]] = probabilities

    logger.info("Writing output data")
    input_mudata.mod[par["modality"]] = input_modality
    input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
# Only run the component logic when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,164 @@
import sys
import os
import pytest
import subprocess
import re
import mudata as mu
from openpipelinetestutils.asserters import assert_annotation_objects_equal
import os
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
import pickle
## VIASH START
# Placeholder used when this test file is run directly
# (presumably the VIASH START/END section is substituted by viash at build time).
meta = {
    "resources_dir": "resources_test"
}
## VIASH END
# Query (input) and reference datasets, resolved relative to the resources directory.
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
@pytest.fixture
def subset_genes(random_h5mu_path):
    """Provide a helper that restricts a query and a reference h5mu to their shared features.

    The returned callable takes the query file, the reference file and a
    modality name. It re-indexes the reference ``.var`` by the ``ensemblid``
    column (with a trailing ``.<digits>`` suffix removed), keeps only the
    features present in both datasets for that modality, writes both subsets
    to new h5mu files and returns ``(query_path, reference_path)``.
    """
    version_suffix = "\\.[0-9]+$"

    def wrapper(input_mudata_file, reference_mudata_file, modality):
        query_mdata = mu.read_h5mu(input_mudata_file)
        query_mod = query_mdata.mod[modality]
        ref_mdata = mu.read_h5mu(reference_mudata_file)
        ref_mod = ref_mdata.mod[modality]

        # Preserve the current index in a "gene_symbol" column, then switch the
        # index to the version-stripped "ensemblid" values. Done for the MuData
        # object first and the modality second.
        for annotated in (ref_mdata, ref_mod):
            annotated.var["gene_symbol"] = list(annotated.var.index)
            annotated.var.index = [
                re.sub(version_suffix, "", ens_id)
                for ens_id in annotated.var["ensemblid"]
            ]

        shared_ids = list(
            set(ref_mod.var.index).intersection(set(query_mod.var.index))
        )
        ref_subset = ref_mod[:, shared_ids].copy()
        query_subset = query_mod[:, shared_ids].copy()
        query_mdata.mod[modality] = query_subset
        ref_mdata.mod[modality] = ref_subset

        query_path = random_h5mu_path()
        ref_path = random_h5mu_path()
        query_mdata.write_h5mu(query_path)
        ref_mdata.write_h5mu(ref_path)
        return query_path, ref_path

    return wrapper
@pytest.fixture
def dummy_model(tmp_path, subset_genes):
    """Train a calibrated linear SVM on the subsetted reference and pickle it.

    The SVM is limited to 10 iterations. Returns the path of the pickled
    model inside ``tmp_path``.
    """
    subset_reference_file = subset_genes(input_file, reference_file, "rna")[1]
    rna = mu.read_h5mu(subset_reference_file).mod["rna"]
    targets = rna.obs["cell_ontology_class"].to_numpy()

    base_estimator = svm.LinearSVC(max_iter=10, dual="auto")
    classifier = CalibratedClassifierCV(base_estimator)
    classifier.fit(rna.X, targets)

    model_path = tmp_path / "model.pkl"
    with model_path.open("wb") as handle:
        pickle.dump(classifier, handle)
    return model_path
def test_simple_execution(run_component, random_h5mu_path, subset_genes):
    """Train on the reference with default settings and check the written annotations."""
    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--reference", subset_reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['svm_pred', 'svm_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["svm_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_custom_out_obs_model_params(run_component, random_h5mu_path, subset_genes):
    """Run with custom output column names and non-default SVM hyperparameters."""
    subset_input_file, subset_reference_file = subset_genes(input_file, reference_file, "rna")
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--reference", subset_reference_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output_obs_prediction", "dummy_pred",
        "--output_obs_probability", "dummy_probability",
        "--max_iter", "1000",
        "--c_reg", "0.1",
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['dummy_pred', 'dummy_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["dummy_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_with_model(run_component, random_h5mu_path, dummy_model, subset_genes):
    """Annotate the query with a pre-trained pickled model instead of a reference."""
    subset_input_file = subset_genes(input_file, reference_file, "rna")[0]
    output_file = random_h5mu_path()
    cli_args = [
        "--input", subset_input_file,
        "--reference_obs_target", "cell_ontology_class",
        "--model", dummy_model,
        "--output", output_file,
    ]
    run_component(cli_args)

    assert os.path.exists(output_file), "Output file does not exist"

    input_mudata = mu.read_h5mu(input_file)
    output_mudata = mu.read_h5mu(output_file)
    # The protein modality is not processed and must come out unchanged.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])

    rna_obs = output_mudata.mod["rna"].obs
    expected_columns = ['svm_pred', 'svm_probability']
    assert list(rna_obs.keys()) == expected_columns

    probabilities = rna_obs["svm_probability"]
    assert all(0 <= p <= 1 for p in probabilities), "probabilities outside the range [0, 1]"
def test_no_model_no_reference_error(run_component, random_h5mu_path):
    """The component must fail when neither --model nor --reference is given."""
    output_file = random_h5mu_path()
    cli_args = [
        "--input", input_file,
        "--reference_obs_target", "cell_ontology_class",
        "--output", output_file,
    ]
    expected_message = r"ValueError: Make sure to provide either 'model' or 'reference', but not both."

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component(cli_args)

    captured_stdout = err.value.stdout.decode('utf-8')
    assert re.search(expected_message, captured_stdout)
# When executed directly, run this file through pytest and exit with its status code.
if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,14 @@
name: Angela Oliveira Pisco
info:
role: Contributor
links:
github: aopisco
orcid: "0000-0003-0142-2355"
linkedin: aopisco
organizations:
- name: Insitro
href: https://insitro.com
role: Director of Computational Biology
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,11 @@
name: Dorien Roosen
info:
role: Core Team Member
links:
email: dorien@data-intuitive.com
github: dorien-er
linkedin: dorien-roosen
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,11 @@
name: Dries De Maeyer
info:
role: Core Team Member
links:
email: ddemaeyer@gmail.com
github: ddemaeyer
linkedin: dries-de-maeyer-b46a814
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Principal Scientist

View File

@@ -0,0 +1,12 @@
name: Dries Schaumont
info:
role: Core Team Member
links:
email: dries@data-intuitive.com
github: DriesSchaumont
orcid: "0000-0002-4389-0440"
linkedin: dries-schaumont
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,6 @@
name: Elizabeth Mlynarski
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Principal Scientist Computational Genomics

View File

@@ -0,0 +1,10 @@
name: Isabelle Bergiers
info:
role: Contributor
links:
github: Isabelle-b
orcid: 0000-0001-9622-7960
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Scientist OMICS Technology

View File

@@ -0,0 +1,11 @@
name: Jakub Majercik
info:
role: Contributor
links:
email: jakub@data-intuitive.com
github: jakubmajercik
linkedin: jakubmajercik
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatics Engineer

View File

@@ -0,0 +1,15 @@
name: Kai Waldrant
info:
role: Contributor
links:
email: kai@data-intuitive.com
github: KaiWaldrant
orcid: "0009-0003-8555-1361"
linkedin: kaiwaldrant
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatician
- name: Open Problems
href: https://openproblems.bio
role: Contributor

View File

@@ -0,0 +1,16 @@
name: Malte D. Luecken
info:
role: Core Team Member
links:
email: malte.luecken@helmholtz-muenchen.de
github: LuckyMD
orcid: "0000-0001-7464-7921"
linkedin: malte-l%C3%BCcken-b8b21049
twitter: MDLuecken
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Group Leader
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,11 @@
name: Marijke Van Moerbeke
info:
role: Contributor
links:
github: mvanmoerbeke
orcid: 0000-0002-3097-5621
linkedin: marijke-van-moerbeke-84303a34
organizations:
- name: OpenAnalytics
href: https://www.openanalytics.eu
role: Statistical Consultant

View File

@@ -0,0 +1,12 @@
name: Matthias Beyens
info:
role: Contributor
links:
github: MatthiasBeyens
orcid: "0000-0003-3304-0706"
email: matthias.beyens@gmail.com
linkedin: mbeyens
organizations:
- name: Janssen Pharmaceuticals
href: https://www.janssen.com
role: Principal Scientist

View File

@@ -0,0 +1,11 @@
name: Mauro Saporita
info:
role: Contributor
links:
email: maurosaporita@gmail.com
github: mauro-saporita
linkedin: mauro-saporita-930b06a5
organizations:
- name: Ardigen
href: https://ardigen.com
role: Lead Nextflow Developer

View File

@@ -0,0 +1,11 @@
name: Povilas Gibas
info:
role: Contributor
links:
email: povilasgibas@gmail.com
github: PoGibas
linkedin: povilas-gibas
organizations:
- name: Ardigen
href: https://ardigen.com
role: Bioinformatician

View File

@@ -0,0 +1,15 @@
name: Robrecht Cannoodt
info:
role: Core Team Member
links:
email: robrecht@data-intuitive.com
github: rcannood
orcid: "0000-0003-3641-729X"
linkedin: robrechtcannoodt
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,10 @@
name: Samuel D'Souza
info:
role: Contributor
links:
github: srdsam
linkedin: samuel-d-souza-887023150/
organizations:
- name: Chan Zuckerberg Biohub
href: https://www.czbiohub.org
role: Data Engineer

View File

@@ -0,0 +1,10 @@
name: Sarah Ouologuem
info:
role: Contributor
links:
github: SarahOuologuem
orcid: 0009-0005-3398-1700
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Student Assistant

View File

@@ -0,0 +1,10 @@
name: Toni Verbeiren
info:
role: Core Team Member
links:
github: tverbeiren
linkedin: verbeiren
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist and CEO

View File

@@ -0,0 +1,12 @@
name: Vladimir Shitov
info:
role: Contributor
links:
email: vladimir.shitov@helmholtz-muenchen.de
github: vladimirshitov
orcid: "0000-0002-1960-8812"
linkedin: vladimir-shitov-9a659513b
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: PhD Candidate

View File

@@ -0,0 +1,6 @@
name: Weiwei Schultz
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Associate Director Data Sciences

View File

@@ -0,0 +1,11 @@
name: Xichen Wu
info:
role: Contributor
links:
github: wxicu
linkedin: xichen-wu
orcid: 0009-0008-2168-4508
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Student Assistant

View File

@@ -0,0 +1,5 @@
__pycache__/
build
eggs/
*.egg
*.egg-info/

View File

@@ -0,0 +1,240 @@
import mudata
import anndata
import pandas as pd
import numpy as np
from scipy.sparse import issparse, spmatrix
from mudata import MuData
from pathlib import Path
from pandas.testing import assert_frame_equal
from typing import Literal
from .typing import AnnotationObjectOrPathLike
from functools import singledispatch
def _read_if_needed(anndata_mudata_path_or_obj):
    """Return an in-memory annotation object for a path or an existing object.

    Args:
        anndata_mudata_path_or_obj: a ``str``/``Path`` that is read with
            ``mudata.read``, or a MuData/AnnData object.

    Returns:
        The object read from disk, or a copy of the object that was passed in.

    Raises:
        AssertionError: if the argument is none of the supported types.
    """
    candidate = anndata_mudata_path_or_obj
    if isinstance(candidate, (str, Path)):
        # TODO: drop the str() conversion once mudata fixes its Path bug
        path_as_string = str(candidate)
        return mudata.read(path_as_string)
    supported_objects = (mudata.MuData, anndata.AnnData)
    if isinstance(candidate, supported_objects):
        # Hand out a copy so that callers never work on the caller's object.
        return candidate.copy()
    raise AssertionError("Expected 'Path', 'str' to MuData/AnnData "
                         "file or MuData/AnnData object.")
def _assert_same_annotation_object_class(left, right):
    """Raise when ``left`` and ``right`` are not instances of the exact same class.

    The check is raised explicitly (instead of using an ``assert`` statement)
    so that it is still performed when Python runs with ``-O``.

    Raises:
        AssertionError: if ``type(left)`` differs from ``type(right)``.
    """
    if type(left) != type(right):
        raise AssertionError(f"Two objects are not of the same class:"
                             f"\n[Left]:{type(left)}\n[right]:{type(right)}")
def _promote_dtypes(left, right):
    """Cast differing real-numeric columns of two frames to a common dtype.

    For every column of ``left`` that also exists in ``right`` and whose
    dtypes differ, both columns are cast to ``np.promote_types`` of the two
    dtypes. Columns are left untouched when the dtypes are equal, when either
    dtype is not a real numeric dtype, or when only one of the two is a pandas
    extension dtype (mixing those cannot be promoted through NumPy).

    Args:
        left: first ``pd.DataFrame``.
        right: second ``pd.DataFrame``.

    Returns:
        A ``(left_aligned, right_aligned)`` tuple of new frames; the inputs
        are not modified.
    """
    is_real_numeric = pd.api.types.is_any_real_numeric_dtype
    is_extension_dtype = pd.api.types.is_extension_array_dtype

    # Create new DataFrames to avoid modifying the original ones
    left_aligned = left.copy()
    right_aligned = right.copy()
    for column in left.columns:
        if column not in right.columns:
            # Nothing to align with; the frame comparison reports the mismatch.
            continue
        l_dtype = left[column].dtype
        r_dtype = right[column].dtype
        if l_dtype == r_dtype:
            # No need to modify dtypes that are already the same
            continue
        if not (is_real_numeric(l_dtype) and is_real_numeric(r_dtype)):
            # Only dtypes that represent real numbers are cast
            continue
        l_is_extension = is_extension_dtype(l_dtype)
        r_is_extension = is_extension_dtype(r_dtype)
        if l_is_extension != r_is_extension:
            # A NumPy dtype and an extension dtype cannot be handed to
            # np.promote_types together, whichever side holds which.
            continue
        numpy_dtype_l = l_dtype.type if l_is_extension else l_dtype
        numpy_dtype_r = r_dtype.type if r_is_extension else r_dtype
        # At this point we should have only integer or float dtypes
        common_dtype = np.promote_types(numpy_dtype_l, numpy_dtype_r)
        if l_is_extension:
            left_aligned[column] = pd.array(left[column], dtype=common_dtype)
            right_aligned[column] = pd.array(right[column], dtype=common_dtype)
        else:
            left_aligned[column] = left[column].astype(common_dtype)
            right_aligned[column] = right[column].astype(common_dtype)
    return left_aligned, right_aligned
def assert_mudata_modality_keys_equal(left, right):
    """Assert that two objects expose the same set of keys in their ``.mod`` mapping.

    Raises:
        AssertionError: if the key sets differ (order is ignored).
    """
    left_keys, right_keys = set(left.mod.keys()), set(right.mod.keys())
    if left_keys == right_keys:
        return
    raise AssertionError("MuData modalities differ:"
                         f"\n[left]:{left_keys}\n[right]:{right_keys}")
def assert_shape_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike):
    """Assert that two annotation objects have the same ``.shape``.

    Both arguments may be objects or paths. For MuData objects the modality
    keys and the shape of every modality are compared as well.

    Raises:
        AssertionError: on a class, shape or modality-key mismatch.
    """
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)
    left_shape, right_shape = left.shape, right.shape
    if left_shape != right_shape:
        raise AssertionError(f"{type(left).__name__} shapes differ:"
                             f"\n[left]:{left_shape}\n[right]:{right_shape}")
    if not isinstance(left, MuData):
        return
    assert_mudata_modality_keys_equal(left, right)
    for mod_name, modality in left.mod.items():
        # Recurse into each modality.
        assert_shape_equal(modality, right[mod_name])
def assert_obs_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike,
                           *args, **kwargs):
    """Assert that two annotation objects have equal ``.obs_names``.

    Both arguments may be objects or paths. For MuData objects the modality
    keys and the ``.obs_names`` of every modality are compared as well.

    Args:
        left: first object or path.
        right: second object or path.
        *args: forwarded to ``pd.testing.assert_index_equal``.
        **kwargs: forwarded to ``pd.testing.assert_index_equal``.

    Raises:
        AssertionError: on a class, index or modality-key mismatch.
    """
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)
    pd.testing.assert_index_equal(left.obs_names, right.obs_names, *args, **kwargs)
    if isinstance(left, MuData):
        assert_mudata_modality_keys_equal(left, right)
        for mod_name, modality in left.mod.items():
            # Forward the comparison options so that modalities are checked
            # under the same rules as the top-level index.
            assert_obs_names_equal(modality, right[mod_name], *args, **kwargs)
def assert_var_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike,
                           *args, **kwargs):
    """Assert that two annotation objects have equal ``.var_names``.

    Both arguments may be objects or paths. For MuData objects the modality
    keys and the ``.var_names`` of every modality are compared as well.

    Args:
        left: first object or path.
        right: second object or path.
        *args: forwarded to ``pd.testing.assert_index_equal``.
        **kwargs: forwarded to ``pd.testing.assert_index_equal``.

    Raises:
        AssertionError: on a class, index or modality-key mismatch.
    """
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)
    pd.testing.assert_index_equal(left.var_names, right.var_names, *args, **kwargs)
    if isinstance(left, MuData):
        assert_mudata_modality_keys_equal(left, right)
        for mod_name, modality in left.mod.items():
            # Forward the comparison options so that modalities are checked
            # under the same rules as the top-level index.
            assert_var_names_equal(modality, right[mod_name], *args, **kwargs)
def _assert_frame_equal(left, right, sort=False, promote_precicion=False, *args, **kwargs):
    """Compare two data frames with optional sorting and dtype promotion.

    Args:
        left: first ``pd.DataFrame``.
        right: second ``pd.DataFrame``.
        sort: when true, sort both frames on their row index and then on
            their column index before comparing.
        promote_precicion: when true, align differing numeric dtypes with
            ``_promote_dtypes`` and compare inexactly (``atol=1e-3``).
        *args: forwarded to ``pandas.testing.assert_frame_equal``.
        **kwargs: forwarded to ``pandas.testing.assert_frame_equal``.

    Raises:
        AssertionError: if the frames differ.
    """
    if sort:
        # Rows first, then columns.
        left = left.sort_index().sort_index(axis=1)
        right = right.sort_index().sort_index(axis=1)
    if not promote_precicion:
        assert_frame_equal(left, right, *args, **kwargs)
        return
    left, right = _promote_dtypes(left, right)
    assert_frame_equal(left, right, check_exact=False, atol=1e-3, *args, **kwargs)
def assert_annotation_frame_equal(annotation_attr: Literal["obs", "var"],
                                  left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike,
                                  sort=False,
                                  promote_precicion=False,
                                  *args, **kwargs):
    """Assert that the ``.obs`` or ``.var`` frames of two annotation objects are equal.

    Both arguments may be objects or paths. For MuData objects the modality
    keys and the same frame of every modality are compared as well.

    Args:
        annotation_attr: ``"obs"`` or ``"var"``.
        left: first object or path.
        right: second object or path.
        sort: passed to ``_assert_frame_equal``.
        promote_precicion: passed to ``_assert_frame_equal``.
        *args: passed to ``_assert_frame_equal``.
        **kwargs: passed to ``_assert_frame_equal``.

    Raises:
        ValueError: if ``annotation_attr`` is not ``"obs"`` or ``"var"``.
        AssertionError: on a class, frame or modality-key mismatch.
    """
    if annotation_attr not in ("obs", "var"):
        raise ValueError("annotation_attr should be 'obs', or 'var'")
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)
    left_frame = getattr(left, annotation_attr)
    right_frame = getattr(right, annotation_attr)
    _assert_frame_equal(left_frame, right_frame, sort=sort,
                        promote_precicion=promote_precicion, *args, **kwargs)
    if not isinstance(left, MuData):
        return
    assert_mudata_modality_keys_equal(left, right)
    for mod_name, modality in left.mod.items():
        assert_annotation_frame_equal(annotation_attr, modality, right[mod_name],
                                      sort=sort, promote_precicion=promote_precicion,
                                      *args, **kwargs)
def _assert_layer_equal(left, right):
    """Assert that two matrices (both dense or both sparse) hold the same data.

    Dense input is compared with ``np.testing.assert_allclose`` (``rtol=1e-5``,
    NaNs compare equal). Sparse input must share format and shape; the stored
    index arrays must be identical and the stored values close. Sparse formats
    that do not expose ``indices``/``indptr`` are converted to CSR first.

    Raises:
        AssertionError: if sparsity, format, shape, structure or values differ.
    """
    if not issparse(left):
        if issparse(right):
            raise AssertionError("Layers differ:\n[left]: not sparse\n[right]: sparse")
        np.testing.assert_allclose(left, right,
                                   rtol=1e-5,
                                   err_msg="Layers data differs.",
                                   equal_nan=True)
        return

    if not issparse(right):
        raise AssertionError("Layers differ:\n[left]: sparse\n[right]: not sparse")
    if left.getformat() != right.getformat():
        raise AssertionError("Layers format differ:"
                             f"\n[left]:{left.getformat()}\n[right]: {right.getformat()}")
    if left.shape != right.shape:
        raise AssertionError("Layers shape differ:"
                             f"\n[left]:{left.shape}\n[right]: {right.shape}")
    if not (hasattr(left, "indices") and hasattr(left, "indptr")):
        # Formats such as COO have no indices/indptr; compare through CSR.
        left, right = left.tocsr(), right.tocsr()
    # np.array_equal returns False for arrays of different length, where an
    # elementwise `==` cannot be broadcast.
    if not np.array_equal(left.indices, right.indices):
        raise AssertionError("Layers differ: indices are not the same")
    if not np.array_equal(left.indptr, right.indptr):
        raise AssertionError("Layers differ: index pointers are not the same")
    np.testing.assert_allclose(left.data, right.data, rtol=1e-5,
                               err_msg="Layers data differs.", equal_nan=True)
def assert_layers_equal(left: AnnotationObjectOrPathLike,
                        right: AnnotationObjectOrPathLike):
    """Assert that ``.raw``, ``.X`` and ``.layers`` of two annotation objects match.

    Both arguments may be objects or paths. For MuData objects the modality
    keys are compared and every modality is checked recursively.

    Raises:
        AssertionError: on the first difference found; a note naming the
            differing slot is attached to the exception.
    """
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)

    # NOTE(review): the .raw attributes are handed to _assert_layer_equal
    # as-is; confirm that this compares the underlying matrices as intended.
    if left.raw is not None:
        try:
            _assert_layer_equal(left.raw, right.raw)
        except AssertionError as e:
            e.add_note(".raw is different")
            raise
    elif right.raw is not None:
        # Report right.raw, not the whole right object.
        raise AssertionError("Layer .raw differs: "
                             f"\n[left]:{left.raw}\n[right]:{right.raw}")

    # NOTE(review): X and layers are only checked when present on the left;
    # content that exists only on the right is not reported here.
    if left.X is not None:
        try:
            _assert_layer_equal(left.X, right.X)
        except AssertionError as e:
            e.add_note("X is different.")
            raise

    if left.layers:
        assert right.layers and (left.layers.keys() == right.layers.keys()), \
            "Avaiable layers differ:" \
            f"\n[left]:{left.layers}\n[right]{right.layers}"
        for layer_name, layer in left.layers.items():
            try:
                _assert_layer_equal(layer, right.layers[layer_name])
            except AssertionError as e:
                e.add_note(f"Layer {layer_name} is different")
                raise

    if isinstance(left, MuData):
        assert_mudata_modality_keys_equal(left, right)
        for mod_name, modality in left.mod.items():
            assert_layers_equal(modality, right[mod_name])
def assert_multidimensional_annotation_equal(annotation_attr: Literal["obsm", "varm"],
                                             left, right, sort=False):
    """Assert that the ``.obsm`` or ``.varm`` mappings of two annotation objects match.

    Both arguments may be objects or paths. Data-frame entries are compared
    with ``_assert_frame_equal`` (honouring ``sort``); NumPy arrays and sparse
    matrices are compared with ``_assert_layer_equal``. For MuData objects the
    modality keys are compared and every modality is checked recursively.

    Raises:
        ValueError: if ``annotation_attr`` is not ``"obsm"`` or ``"varm"``.
        NotImplementedError: if an entry has a type without a registered comparison.
        AssertionError: on any difference; notes naming the failing key and
            modality are attached to the exception.
    """
    if annotation_attr not in ("obsm", "varm"):
        raise ValueError("annotation_attr should be 'obsm', or 'varm'")
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)

    # Dispatch on the type of the left-hand value.
    @singledispatch
    def _compare_values(left_value, right_value, **kwargs):
        raise NotImplementedError("Unregistered type found while asserting")

    @_compare_values.register(pd.DataFrame)
    def _(left_value, right_value, **kwargs):
        _assert_frame_equal(left_value, right_value, **kwargs)

    @_compare_values.register(np.ndarray)
    @_compare_values.register(spmatrix)
    def _(left_value, right_value, **kwargs):
        # Cannot sort sparse and dense matrices so ignore sort param
        _assert_layer_equal(left_value, right_value)

    left_dict = getattr(left, annotation_attr)
    right_dict = getattr(right, annotation_attr)
    left_keys, right_keys = left_dict.keys(), right_dict.keys()
    assert left_keys == right_keys, f"Keys of {annotation_attr} differ:\n[left]:{left_keys}\n[right]:{right_keys}"

    for key, left_value in left_dict.items():
        try:
            _compare_values(left_value, right_dict[key], sort=sort)
        except AssertionError as e:
            e.add_note(f"Failing key: {key}")
            raise

    if not isinstance(left, MuData):
        return
    assert_mudata_modality_keys_equal(left, right)
    for mod_name, modality in left.mod.items():
        try:
            assert_multidimensional_annotation_equal(annotation_attr, modality,
                                                     right[mod_name], sort=sort)
        except AssertionError as e:
            e.add_note(f"Failing modality: {mod_name}")
            raise
def assert_annotation_objects_equal(left: AnnotationObjectOrPathLike,
                                    right: AnnotationObjectOrPathLike,
                                    check_data=True,
                                    sort=True,
                                    promote_precision=False):
    """Assert that two annotation objects are equal across all compared slots.

    Checks, in order: class, shape, ``.obs``, ``.var``, ``.varm``, ``.obsm``
    and, when ``check_data`` is true, the data layers.

    Args:
        left: first object or path.
        right: second object or path.
        check_data: also run ``assert_layers_equal``.
        sort: passed to the frame and multidimensional comparisons.
        promote_precision: passed to the ``.obs``/``.var`` comparisons.

    Raises:
        AssertionError: on the first difference found.
    """
    left, right = _read_if_needed(left), _read_if_needed(right)
    _assert_same_annotation_object_class(left, right)
    assert_shape_equal(left, right)

    for frame_attr in ("obs", "var"):
        assert_annotation_frame_equal(frame_attr, left, right, sort=sort,
                                      promote_precicion=promote_precision)

    for slot in ("varm", "obsm"):
        try:
            assert_multidimensional_annotation_equal(slot, left, right, sort=sort)
        except AssertionError as e:
            e.add_note(f"Failing multidimensional slot: {slot}")
            raise

    if check_data:
        assert_layers_equal(left, right)

View File

@@ -0,0 +1,13 @@
import importlib
import pytest
from pathlib import Path
def pytest_collect_file(file_path: Path, parent):
    """Collect a file named ``.viash_script.sh`` as a pytest module.

    Args:
        file_path: path of the file pytest is considering.
        parent: parent collector node.

    Returns:
        A ``pytest.Module`` for ``.viash_script.sh`` files, otherwise ``None``.
    """
    # Import the submodule explicitly; a bare `import importlib` does not
    # guarantee that `importlib.machinery` is loaded.
    import importlib.machinery

    if file_path.name != ".viash_script.sh":
        return None
    # Allow file ending in .sh to be imported. Register the suffix only once,
    # however many matching files are collected.
    suffixes = importlib.machinery.SOURCE_SUFFIXES
    if '.viash_script.sh' not in suffixes:
        suffixes.append('.viash_script.sh')
    return pytest.Module.from_parent(parent, path=file_path)
def pytest_collection_finish(session):
    """Unregister ``.viash_script.sh`` as a Python source suffix after collection.

    Does nothing when the suffix is not registered (for example when no such
    file was collected); every registered occurrence is removed.
    """
    # Import the submodule explicitly; a bare `import importlib` does not
    # guarantee that `importlib.machinery` is loaded.
    import importlib.machinery

    suffixes = importlib.machinery.SOURCE_SUFFIXES
    while '.viash_script.sh' in suffixes:
        suffixes.remove('.viash_script.sh')

View File

@@ -0,0 +1,63 @@
from uuid import uuid4
import pytest
import pandas as pd
import anndata as ad
import mudata as md
@pytest.fixture
def random_path(tmp_path):
    """Provide a factory that builds unique paths inside ``tmp_path``.

    The factory takes an optional ``extension`` (without the leading dot).
    """
    def wrapper(extension=None):
        suffix = f".{extension}" if extension else ""
        return tmp_path / f"{uuid4()}{suffix}"
    return wrapper
@pytest.fixture
def random_h5mu_path(random_path):
    """Provide a zero-argument factory for unique paths with the ``h5mu`` extension."""
    def wrapper():
        h5mu_path = random_path(extension="h5mu")
        return h5mu_path
    return wrapper
@pytest.fixture
def write_mudata_to_file(random_h5mu_path):
    """Provide a function that writes an object to a fresh h5mu path and returns that path."""
    def wrapper(mudata_obj):
        destination = random_h5mu_path()
        mudata_obj.write(destination)
        return destination
    return wrapper
@pytest.fixture
def small_anndata_1():
    """Build a 2 x 3 AnnData (obs1-2, var1-3) with one ``Obs`` and one ``Feat`` column."""
    obs_names = ["obs1", "obs2"]
    var_names = ["var1", "var2", "var3"]
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=obs_names, columns=var_names)
    obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"])
    var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"])
    return ad.AnnData(df, obs=obs, var=var)
@pytest.fixture
def small_anndata_2():
    """Build a 2 x 3 AnnData (obs1-2, var4-6) with one ``Obs`` and one ``Feat`` column."""
    obs_names = ["obs1", "obs2"]
    var_names = ["var4", "var5", "var6"]
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=obs_names, columns=var_names)
    obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"])
    var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"])
    return ad.AnnData(df, obs=obs2, var=var2)
@pytest.fixture
def small_mudata(small_anndata_1, small_anndata_2):
    """Combine the two small AnnData fixtures into a MuData with modalities ``mod1`` and ``mod2``."""
    modalities = {'mod1': small_anndata_1, 'mod2': small_anndata_2}
    return md.MuData(modalities)
@pytest.fixture
def small_mudata_path(small_mudata, write_mudata_to_file):
    """Write the ``small_mudata`` fixture to disk and return the path of the file."""
    written_path = write_mudata_to_file(small_mudata)
    return written_path
@pytest.fixture
def split_small_mudata_path(small_mudata_mod1_path, small_mudata_mod2_path):
    """Return the paths of the single-modality files as a ``(mod1, mod2)`` tuple."""
    paths = (small_mudata_mod1_path, small_mudata_mod2_path)
    return paths
@pytest.fixture
def small_mudata_mod1_path(small_mudata, write_mudata_to_file):
    """Write a MuData holding only modality ``mod1`` of ``small_mudata`` and return its path."""
    only_mod1 = md.MuData({'mod1': small_mudata.mod['mod1']})
    return write_mudata_to_file(only_mod1)
@pytest.fixture
def small_mudata_mod2_path(small_mudata, write_mudata_to_file):
    """Write a MuData holding only modality ``mod2`` of ``small_mudata`` and return its path."""
    only_mod2 = md.MuData({'mod2': small_mudata.mod['mod2']})
    return write_mudata_to_file(only_mod2)

View File

@@ -0,0 +1,5 @@
[build-system]
requires = [
"setuptools >= 40.9.0",
]
build-backend = "setuptools.build_meta"

View File

@@ -0,0 +1,22 @@
[metadata]
name = openpipelinetestutils
author = Dries Schaumont
author_email = dries@data-intuitive.com
maintainer = Dries Schaumont
maintainer_email = dries@data-intuitive.com
description = Various test utilities for openpipeline.
license = MIT
[options]
python_requires = >=3.8
install_requires =
pytest >= 6.2
mudata~=0.2.3
pandas!=2.1.2
anndata~=0.9.1
package_dir=
openpipelinetestutils = .
[options.entry_points]
pytest11 =
openpipelineutils = openpipelinetestutils.fixtures

View File

@@ -0,0 +1,7 @@
from typing import Union
from mudata import MuData
from anndata import AnnData
from pathlib import Path
# An in-memory annotation container: either a MuData or an AnnData object.
AnnotationObject = Union[MuData, AnnData]
# An annotation object, or a str/Path standing in for one
# (presumably the location of a file holding such an object; see the callers).
AnnotationObjectOrPathLike = Union[AnnotationObject, str, Path]

View File

@@ -0,0 +1,60 @@
from .typing import AnnotationObject
from typing import Union, Literal
from functools import reduce
from operator import attrgetter
from anndata import AnnData
from mudata import MuData
from itertools import product
def remove_annotation_column(annotation_object: AnnotationObject,
                             column_names: list[str] | str,
                             axis: Literal["obs", "var", 0, 1],
                             modality_name: str | None = None):
    """Drop columns from the ``.obs`` or ``.var`` frame of an annotation object.

    The frame on ``annotation_object`` is replaced by a copy without the
    requested columns. For MuData objects the same columns are also removed
    from the selected modality, or from every modality when ``modality_name``
    is not given.

    Args:
        annotation_object: AnnData or MuData object to modify.
        column_names: a single column name or a list of column names.
        axis: ``"obs"`` or ``0`` for ``.obs``; ``"var"`` or ``1`` for ``.var``.
        modality_name: restrict the removal to this modality; only allowed
            for MuData objects.

    Returns:
        The same ``annotation_object`` that was passed in.

    Raises:
        ValueError: if ``modality_name`` is given for an AnnData object, or
            if a MuData object has ``.axis`` different from 0.
        KeyError: if ``axis`` is not one of the supported values.
    """
    if isinstance(annotation_object, AnnData) and modality_name is not None:
        raise ValueError("Cannot specify modality when object is of type AnnData.")
    if isinstance(column_names, str):
        column_names = [str(column_names)]  # str to make a copy
    axis_strings = {
        "var": "var",
        "obs": "obs",
        0: "obs",
        1: "var"
    }
    axis_string = axis_strings[axis]
    axis_getter = attrgetter(axis_string)

    def axis_setter(obj, value):
        # Replace the obs/var frame on `obj`.
        setattr(obj, axis_string, value)

    if not modality_name:
        axis_setter(annotation_object, axis_getter(annotation_object).drop(column_names,
                                                                           axis="columns",
                                                                           inplace=False))

    def _get_columns_in_all_modalities(annotation_object, axis_string: str):
        # Column names that are present in the chosen frame of every modality.
        return reduce(
            lambda a, b: a.intersection(b),
            [getattr(annotation_object.mod[mod], axis_string).columns
             for mod in annotation_object.mod],
        ).to_list()

    if isinstance(annotation_object, MuData):
        if not annotation_object.axis == 0:
            raise ValueError("This function was designed for mudata objects with .axis=0")
        modality_names = [modality_name] if modality_name else list(annotation_object.mod.keys())
        global_columns = _get_columns_in_all_modalities(annotation_object, axis_string) \
            if axis_string == "var" else []
        # In the MuData-level frame a column is addressed as
        # "<modality>:<column>", unless it is one of the global columns,
        # in which case the bare name is used.
        extra_cols_to_remove = [f"{mod_name}:{column_name}" for mod_name, column_name
                                in product(modality_names, column_names)
                                if column_name not in global_columns]
        extra_cols_to_remove += [column_name for column_name in column_names
                                 if column_name in global_columns]
        if modality_name:
            axis_setter(annotation_object, axis_getter(annotation_object).drop(extra_cols_to_remove,
                                                                               axis="columns",
                                                                               inplace=False))
        for mod_name in modality_names:
            modality = annotation_object.mod[mod_name]
            new_modality = remove_annotation_column(modality, column_names,
                                                    axis=axis, modality_name=None)
            annotation_object.mod[mod_name] = new_modality
    return annotation_object

View File

@@ -0,0 +1,3 @@
packages:
- anndata==0.10.8

View File

@@ -0,0 +1,5 @@
__merge__: [/src/base/requirements/anndata.yaml, .]
packages:
- mudata~=0.2.4
- pandas!=2.1.2
- numpy<2.0.0

View File

@@ -0,0 +1,7 @@
test_setup:
- type: docker
copy: ["openpipelinetestutils /opt/openpipelinetestutils"]
- type: python
packages: /opt/openpipelinetestutils
- type: python
__merge__: /src/base/requirements/viashpy.yaml

View File

@@ -0,0 +1,3 @@
packages:
- scanpy~=1.9.6

Some files were not shown because too many files have changed in this diff Show More